From 54eb4e495486cd4e6d1e09a92f09204a0d069184 Mon Sep 17 00:00:00 2001 From: ShuNing Date: Tue, 19 Sep 2023 05:25:40 +0800 Subject: [PATCH 01/14] resource_control: supports dynamically change the controller config (#7042) close tikv/pd#7043 resource_control: supports dynamically change the controller config - supports dynamically changing the controller config - export the `maxWaitDuration` for the local bucket limiter. Signed-off-by: nolouch Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- client/go.mod | 1 + client/go.sum | 1 + client/resource_group/controller/config.go | 44 +++---- .../resource_group/controller/controller.go | 123 +++++++++++++----- client/resource_group/controller/util.go | 68 ++++++++++ client/resource_group/controller/util_test.go | 51 ++++++++ client/resource_manager_client.go | 10 +- pkg/mcs/resourcemanager/server/apis/v1/api.go | 45 +++++++ pkg/mcs/resourcemanager/server/config.go | 6 + pkg/mcs/resourcemanager/server/config_test.go | 2 + pkg/mcs/resourcemanager/server/manager.go | 72 +++++++++- pkg/storage/endpoint/resource_group.go | 6 + server/server.go | 5 +- .../resourcemanager/resource_manager_test.go | 104 +++++++++++++++ 14 files changed, 473 insertions(+), 65 deletions(-) create mode 100644 client/resource_group/controller/util.go create mode 100644 client/resource_group/controller/util_test.go diff --git a/client/go.mod b/client/go.mod index 9eb066d0fcc..099bcf86296 100644 --- a/client/go.mod +++ b/client/go.mod @@ -3,6 +3,7 @@ module github.com/tikv/pd/client go 1.21 require ( + github.com/BurntSushi/toml v0.3.1 github.com/cloudfoundry/gosigar v1.3.6 github.com/gogo/protobuf v1.3.2 github.com/opentracing/opentracing-go v1.2.0 diff --git a/client/go.sum b/client/go.sum index 33ba3254d53..9261ab4a999 100644 --- a/client/go.sum +++ b/client/go.sum @@ -1,4 +1,5 @@ cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= diff --git a/client/resource_group/controller/config.go b/client/resource_group/controller/config.go index 2095bc60601..16a2525cd0d 100644 --- a/client/resource_group/controller/config.go +++ b/client/resource_group/controller/config.go @@ -51,7 +51,7 @@ const ( // According to the resource control Grafana panel and Prometheus sampling period, the period should be the factor of 15. defaultTargetPeriod = 5 * time.Second // defaultMaxWaitDuration is the max duration to wait for the token before throwing error. - defaultMaxWaitDuration = time.Second + defaultMaxWaitDuration = 30 * time.Second ) const ( @@ -73,14 +73,17 @@ const ( // Because the resource manager has not been deployed in microservice mode, // do not enable this function. - defaultDegradedModeWaitDuration = "0s" + defaultDegradedModeWaitDuration = 0 defaultAvgBatchProportion = 0.7 ) // Config is the configuration of the resource manager controller which includes some option for client needed. type Config struct { // EnableDegradedMode is to control whether resource control client enable degraded mode when server is disconnect. 
- DegradedModeWaitDuration string `toml:"degraded-mode-wait-duration" json:"degraded-mode-wait-duration"` + DegradedModeWaitDuration Duration `toml:"degraded-mode-wait-duration" json:"degraded-mode-wait-duration"` + + // LTBMaxWaitDuration is the max wait time duration for local token bucket. + LTBMaxWaitDuration Duration `toml:"ltb-max-wait-duration" json:"ltb-max-wait-duration"` // RequestUnit is the configuration determines the coefficients of the RRU and WRU cost. // This configuration should be modified carefully. @@ -90,7 +93,8 @@ type Config struct { // DefaultConfig returns the default resource manager controller configuration. func DefaultConfig() *Config { return &Config{ - DegradedModeWaitDuration: defaultDegradedModeWaitDuration, + DegradedModeWaitDuration: NewDuration(defaultDegradedModeWaitDuration), + LTBMaxWaitDuration: NewDuration(defaultMaxWaitDuration), RequestUnit: DefaultRequestUnitConfig(), } } @@ -143,8 +147,10 @@ type RUConfig struct { WriteBytesCost RequestUnit CPUMsCost RequestUnit // The CPU statistics need to distinguish between different environments. - isSingleGroupByKeyspace bool - maxWaitDuration time.Duration + isSingleGroupByKeyspace bool + + // some config for client + LTBMaxWaitDuration time.Duration DegradedModeWaitDuration time.Duration } @@ -157,21 +163,15 @@ func DefaultRUConfig() *RUConfig { // GenerateRUConfig generates the configuration by the given request unit configuration. func GenerateRUConfig(config *Config) *RUConfig { - cfg := &RUConfig{ - ReadBaseCost: RequestUnit(config.RequestUnit.ReadBaseCost), - ReadPerBatchBaseCost: RequestUnit(config.RequestUnit.ReadPerBatchBaseCost), - ReadBytesCost: RequestUnit(config.RequestUnit.ReadCostPerByte), - WriteBaseCost: RequestUnit(config.RequestUnit.WriteBaseCost), - WritePerBatchBaseCost: RequestUnit(config.RequestUnit.WritePerBatchBaseCost), - WriteBytesCost: RequestUnit(config.RequestUnit.WriteCostPerByte), - CPUMsCost: RequestUnit(config.RequestUnit.CPUMsCost), - maxWaitDuration: defaultMaxWaitDuration, - } - duration, err := time.ParseDuration(config.DegradedModeWaitDuration) - if err != nil { - cfg.DegradedModeWaitDuration, _ = time.ParseDuration(defaultDegradedModeWaitDuration) - } else { - cfg.DegradedModeWaitDuration = duration + return &RUConfig{ + ReadBaseCost: RequestUnit(config.RequestUnit.ReadBaseCost), + ReadPerBatchBaseCost: RequestUnit(config.RequestUnit.ReadPerBatchBaseCost), + ReadBytesCost: RequestUnit(config.RequestUnit.ReadCostPerByte), + WriteBaseCost: RequestUnit(config.RequestUnit.WriteBaseCost), + WritePerBatchBaseCost: RequestUnit(config.RequestUnit.WritePerBatchBaseCost), + WriteBytesCost: RequestUnit(config.RequestUnit.WriteCostPerByte), + CPUMsCost: RequestUnit(config.RequestUnit.CPUMsCost), + LTBMaxWaitDuration: config.LTBMaxWaitDuration.Duration, + DegradedModeWaitDuration: config.DegradedModeWaitDuration.Duration, } - return cfg } diff --git a/client/resource_group/controller/controller.go b/client/resource_group/controller/controller.go index c79bfec1e56..528369df229 100755 --- a/client/resource_group/controller/controller.go +++ b/client/resource_group/controller/controller.go @@ -71,9 +71,11 @@ type ResourceGroupProvider interface { ModifyResourceGroup(ctx context.Context, metaGroup *rmpb.ResourceGroup) (string, error) DeleteResourceGroup(ctx context.Context, resourceGroupName string) (string, error) AcquireTokenBuckets(ctx context.Context, request *rmpb.TokenBucketsRequest) ([]*rmpb.TokenBucketResponse, error) - LoadGlobalConfig(ctx context.Context, names []string, 
configPath string) ([]pd.GlobalConfigItem, int64, error) LoadResourceGroups(ctx context.Context) ([]*rmpb.ResourceGroup, int64, error) + + // meta storage client Watch(ctx context.Context, key []byte, opts ...pd.OpOption) (chan []*meta_storagepb.Event, error) + Get(ctx context.Context, key []byte, opts ...pd.OpOption) (*meta_storagepb.GetResponse, error) } // ResourceControlCreateOption create a ResourceGroupsController with the optional settings. @@ -89,7 +91,7 @@ func EnableSingleGroupByKeyspace() ResourceControlCreateOption { // WithMaxWaitDuration is the option to set the max wait duration for acquiring token buckets. func WithMaxWaitDuration(d time.Duration) ResourceControlCreateOption { return func(controller *ResourceGroupsController) { - controller.ruConfig.maxWaitDuration = d + controller.ruConfig.LTBMaxWaitDuration = d } } @@ -122,6 +124,11 @@ type ResourceGroupsController struct { // Currently, we don't do multiple `AcquireTokenBuckets`` at the same time, so there are no concurrency problems with `currentRequests`. currentRequests []*rmpb.TokenBucketRequest } + + opts []ResourceControlCreateOption + + // a cache for ru config and make concurrency safe. + safeRuConfig atomic.Pointer[RUConfig] } // NewResourceGroupController returns a new ResourceGroupsController which impls ResourceGroupKVInterceptor @@ -139,7 +146,7 @@ func NewResourceGroupController( if requestUnitConfig != nil { config.RequestUnit = *requestUnitConfig } - log.Info("load resource controller config", zap.Reflect("config", config)) + ruConfig := GenerateRUConfig(config) controller := &ResourceGroupsController{ clientUniqueID: clientUniqueID, @@ -148,34 +155,37 @@ func NewResourceGroupController( lowTokenNotifyChan: make(chan struct{}, 1), tokenResponseChan: make(chan []*rmpb.TokenBucketResponse, 1), tokenBucketUpdateChan: make(chan *groupCostController, maxNotificationChanLen), + opts: opts, } for _, opt := range opts { opt(controller) } + log.Info("load resource controller config", zap.Reflect("config", config), zap.Reflect("ru-config", controller.ruConfig)) controller.calculators = []ResourceCalculator{newKVCalculator(controller.ruConfig), newSQLCalculator(controller.ruConfig)} + controller.safeRuConfig.Store(controller.ruConfig) return controller, nil } func loadServerConfig(ctx context.Context, provider ResourceGroupProvider) (*Config, error) { - items, _, err := provider.LoadGlobalConfig(ctx, nil, controllerConfigPath) + resp, err := provider.Get(ctx, []byte(controllerConfigPath)) if err != nil { return nil, err } - if len(items) == 0 { + if len(resp.Kvs) == 0 { log.Warn("[resource group controller] server does not save config, load config failed") return DefaultConfig(), nil } config := &Config{} - err = json.Unmarshal(items[0].PayLoad, config) + err = json.Unmarshal(resp.Kvs[0].GetValue(), config) if err != nil { return nil, err } return config, nil } -// GetConfig returns the config of controller. It's only used for test. +// GetConfig returns the config of controller. 
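A minimal usage sketch of the reworked client (illustrative only; `cli` is assumed to be a PD client that satisfies ResourceGroupProvider): the controller now reads its config from the controller config path (`resource_group/controller`) through `Get`, and options passed at construction time are re-applied after every dynamic reload, so `WithMaxWaitDuration` keeps overriding the server-pushed `ltb-max-wait-duration`.

    import (
        "context"
        "time"

        "github.com/tikv/pd/client/resource_group/controller"
    )

    func startController(ctx context.Context, cli controller.ResourceGroupProvider) (*controller.ResourceGroupsController, error) {
        c, err := controller.NewResourceGroupController(ctx, 1 /* clientUniqueID */, cli, nil,
            controller.WithMaxWaitDuration(time.Hour)) // client-side override of the local token bucket wait
        if err != nil {
            return nil, err
        }
        c.Start(ctx)
        // GetConfig returns the concurrency-safe snapshot kept in safeRuConfig.
        _ = c.GetConfig().LTBMaxWaitDuration // stays time.Hour even after a server-side change
        return c, nil
    }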
func (c *ResourceGroupsController) GetConfig() *RUConfig { - return c.ruConfig + return c.safeRuConfig.Load() } // Source List @@ -213,22 +223,63 @@ func (c *ResourceGroupsController) Start(ctx context.Context) { stateUpdateTicker = time.NewTicker(time.Millisecond * 100) }) - _, revision, err := c.provider.LoadResourceGroups(ctx) + _, metaRevision, err := c.provider.LoadResourceGroups(ctx) + if err != nil { + log.Warn("load resource group revision failed", zap.Error(err)) + } + resp, err := c.provider.Get(ctx, []byte(controllerConfigPath)) if err != nil { log.Warn("load resource group revision failed", zap.Error(err)) } - var watchChannel chan []*meta_storagepb.Event + cfgRevision := resp.GetHeader().GetRevision() + var watchMetaChannel, watchConfigChannel chan []*meta_storagepb.Event if !c.ruConfig.isSingleGroupByKeyspace { - watchChannel, err = c.provider.Watch(ctx, pd.GroupSettingsPathPrefixBytes, pd.WithRev(revision), pd.WithPrefix()) + watchMetaChannel, err = c.provider.Watch(ctx, pd.GroupSettingsPathPrefixBytes, pd.WithRev(metaRevision), pd.WithPrefix()) + if err != nil { + log.Warn("watch resource group meta failed", zap.Error(err)) + } } - watchRetryTimer := time.NewTimer(watchRetryInterval) - if err == nil || c.ruConfig.isSingleGroupByKeyspace { - watchRetryTimer.Stop() + + watchConfigChannel, err = c.provider.Watch(ctx, pd.ControllerConfigPathPrefixBytes, pd.WithRev(cfgRevision), pd.WithPrefix()) + if err != nil { + log.Warn("watch resource group config failed", zap.Error(err)) } + watchRetryTimer := time.NewTimer(watchRetryInterval) defer watchRetryTimer.Stop() for { select { + /* tickers */ + case <-cleanupTicker.C: + c.cleanUpResourceGroup() + case <-stateUpdateTicker.C: + c.executeOnAllGroups((*groupCostController).updateRunState) + c.executeOnAllGroups((*groupCostController).updateAvgRequestResourcePerSec) + if len(c.run.currentRequests) == 0 { + c.collectTokenBucketRequests(c.loopCtx, FromPeriodReport, periodicReport /* select resource groups which should be reported periodically */) + } + case <-watchRetryTimer.C: + if !c.ruConfig.isSingleGroupByKeyspace && watchMetaChannel == nil { + watchMetaChannel, err = c.provider.Watch(ctx, pd.GroupSettingsPathPrefixBytes, pd.WithRev(metaRevision), pd.WithPrefix()) + if err != nil { + log.Warn("watch resource group meta failed", zap.Error(err)) + watchRetryTimer.Reset(watchRetryInterval) + failpoint.Inject("watchStreamError", func() { + watchRetryTimer.Reset(20 * time.Millisecond) + }) + } + } + if watchConfigChannel == nil { + watchConfigChannel, err = c.provider.Watch(ctx, pd.ControllerConfigPathPrefixBytes, pd.WithRev(cfgRevision), pd.WithPrefix()) + if err != nil { + log.Warn("watch resource group config failed", zap.Error(err)) + watchRetryTimer.Reset(watchRetryInterval) + } + } + + case <-emergencyTokenAcquisitionTicker.C: + c.executeOnAllGroups((*groupCostController).resetEmergencyTokenAcquisition) + /* channels */ case <-c.loopCtx.Done(): resourceGroupStatusGauge.Reset() return @@ -242,14 +293,6 @@ func (c *ResourceGroupsController) Start(ctx context.Context) { c.handleTokenBucketResponse(resp) } c.run.currentRequests = nil - case <-cleanupTicker.C: - c.cleanUpResourceGroup() - case <-stateUpdateTicker.C: - c.executeOnAllGroups((*groupCostController).updateRunState) - c.executeOnAllGroups((*groupCostController).updateAvgRequestResourcePerSec) - if len(c.run.currentRequests) == 0 { - c.collectTokenBucketRequests(c.loopCtx, FromPeriodReport, periodicReport /* select resource groups which should be reported periodically */) - } 
case <-c.lowTokenNotifyChan: c.executeOnAllGroups((*groupCostController).updateRunState) c.executeOnAllGroups((*groupCostController).updateAvgRequestResourcePerSec) @@ -259,16 +302,14 @@ func (c *ResourceGroupsController) Start(ctx context.Context) { if c.run.inDegradedMode { c.executeOnAllGroups((*groupCostController).applyDegradedMode) } - case <-emergencyTokenAcquisitionTicker.C: - c.executeOnAllGroups((*groupCostController).resetEmergencyTokenAcquisition) - case resp, ok := <-watchChannel: + case resp, ok := <-watchMetaChannel: failpoint.Inject("disableWatch", func() { if c.ruConfig.isSingleGroupByKeyspace { panic("disableWatch") } }) if !ok { - watchChannel = nil + watchMetaChannel = nil watchRetryTimer.Reset(watchRetryInterval) failpoint.Inject("watchStreamError", func() { watchRetryTimer.Reset(20 * time.Millisecond) @@ -276,7 +317,7 @@ func (c *ResourceGroupsController) Start(ctx context.Context) { continue } for _, item := range resp { - revision = item.Kv.ModRevision + metaRevision = item.Kv.ModRevision group := &rmpb.ResourceGroup{} if err := proto.Unmarshal(item.Kv.Value, group); err != nil { continue @@ -293,14 +334,32 @@ func (c *ResourceGroupsController) Start(ctx context.Context) { } } } - case <-watchRetryTimer.C: - watchChannel, err = c.provider.Watch(ctx, pd.GroupSettingsPathPrefixBytes, pd.WithRev(revision), pd.WithPrefix()) - if err != nil { + case resp, ok := <-watchConfigChannel: + if !ok { + watchConfigChannel = nil watchRetryTimer.Reset(watchRetryInterval) failpoint.Inject("watchStreamError", func() { watchRetryTimer.Reset(20 * time.Millisecond) }) + continue + } + for _, item := range resp { + cfgRevision = item.Kv.ModRevision + config := &Config{} + if err := json.Unmarshal(item.Kv.Value, config); err != nil { + continue + } + c.ruConfig = GenerateRUConfig(config) + + // Stay compatible with serverless + for _, opt := range c.opts { + opt(c) + } + copyCfg := *c.ruConfig + c.safeRuConfig.Store(©Cfg) + log.Info("load resource controller config after config changed", zap.Reflect("config", config), zap.Reflect("ruConfig", c.ruConfig)) } + case gc := <-c.tokenBucketUpdateChan: now := gc.run.now go gc.handleTokenBucketUpdateEvent(c.loopCtx, now) @@ -1127,7 +1186,7 @@ func (gc *groupCostController) onRequestWait( res := make([]*Reservation, 0, len(requestResourceLimitTypeList)) for typ, counter := range gc.run.resourceTokens { if v := getRawResourceValueFromConsumption(delta, typ); v > 0 { - res = append(res, counter.limiter.Reserve(ctx, gc.mainCfg.maxWaitDuration, now, v)) + res = append(res, counter.limiter.Reserve(ctx, gc.mainCfg.LTBMaxWaitDuration, now, v)) } } if d, err = WaitReservations(ctx, now, res); err == nil { @@ -1137,7 +1196,7 @@ func (gc *groupCostController) onRequestWait( res := make([]*Reservation, 0, len(requestUnitLimitTypeList)) for typ, counter := range gc.run.requestUnitTokens { if v := getRUValueFromConsumption(delta, typ); v > 0 { - res = append(res, counter.limiter.Reserve(ctx, gc.mainCfg.maxWaitDuration, now, v)) + res = append(res, counter.limiter.Reserve(ctx, gc.mainCfg.LTBMaxWaitDuration, now, v)) } } if d, err = WaitReservations(ctx, now, res); err == nil { diff --git a/client/resource_group/controller/util.go b/client/resource_group/controller/util.go new file mode 100644 index 00000000000..e3450e0ae0d --- /dev/null +++ b/client/resource_group/controller/util.go @@ -0,0 +1,68 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// Copyright 2023 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS,g +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package controller + +import ( + "fmt" + "strconv" + "time" + + "github.com/pingcap/errors" +) + +// Duration is a wrapper of time.Duration for TOML and JSON. +type Duration struct { + time.Duration +} + +// NewDuration creates a Duration from time.Duration. +func NewDuration(duration time.Duration) Duration { + return Duration{Duration: duration} +} + +// MarshalJSON returns the duration as a JSON string. +func (d *Duration) MarshalJSON() ([]byte, error) { + return []byte(fmt.Sprintf(`"%s"`, d.String())), nil +} + +// UnmarshalJSON parses a JSON string into the duration. +func (d *Duration) UnmarshalJSON(text []byte) error { + s, err := strconv.Unquote(string(text)) + if err != nil { + return errors.WithStack(err) + } + duration, err := time.ParseDuration(s) + if err != nil { + return errors.WithStack(err) + } + d.Duration = duration + return nil +} + +// UnmarshalText parses a TOML string into the duration. +func (d *Duration) UnmarshalText(text []byte) error { + var err error + d.Duration, err = time.ParseDuration(string(text)) + return errors.WithStack(err) +} + +// MarshalText returns the duration as a JSON string. +func (d Duration) MarshalText() ([]byte, error) { + return []byte(d.String()), nil +} diff --git a/client/resource_group/controller/util_test.go b/client/resource_group/controller/util_test.go new file mode 100644 index 00000000000..b542e6713dc --- /dev/null +++ b/client/resource_group/controller/util_test.go @@ -0,0 +1,51 @@ +// Copyright 2023 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package controller + +import ( + "encoding/json" + "testing" + + "github.com/BurntSushi/toml" + "github.com/stretchr/testify/require" +) + +type example struct { + Interval Duration `json:"interval" toml:"interval"` +} + +func TestDurationJSON(t *testing.T) { + t.Parallel() + re := require.New(t) + example := &example{} + + text := []byte(`{"interval":"1h1m1s"}`) + re.NoError(json.Unmarshal(text, example)) + re.Equal(float64(60*60+60+1), example.Interval.Seconds()) + + b, err := json.Marshal(example) + re.NoError(err) + re.Equal(string(text), string(b)) +} + +func TestDurationTOML(t *testing.T) { + t.Parallel() + re := require.New(t) + example := &example{} + + text := []byte(`interval = "1h1m1s"`) + re.Nil(toml.Unmarshal(text, example)) + re.Equal(float64(60*60+60+1), example.Interval.Seconds()) +} diff --git a/client/resource_manager_client.go b/client/resource_manager_client.go index 61919a2ccb2..68b2de66ae2 100644 --- a/client/resource_manager_client.go +++ b/client/resource_manager_client.go @@ -31,9 +31,10 @@ import ( type actionType int const ( - add actionType = 0 - modify actionType = 1 - groupSettingsPathPrefix = "resource_group/settings" + add actionType = 0 + modify actionType = 1 + groupSettingsPathPrefix = "resource_group/settings" + controllerConfigPathPrefix = "resource_group/controller" // errNotPrimary is returned when the requested server is not primary. errNotPrimary = "not primary" // errNotLeader is returned when the requested server is not pd leader. @@ -43,6 +44,9 @@ const ( // GroupSettingsPathPrefixBytes is used to watch or get resource groups. var GroupSettingsPathPrefixBytes = []byte(groupSettingsPathPrefix) +// ControllerConfigPathPrefixBytes is used to watch or get controller config. +var ControllerConfigPathPrefixBytes = []byte(controllerConfigPathPrefix) + // ResourceManagerClient manages resource group info and token request. type ResourceManagerClient interface { ListResourceGroups(ctx context.Context) ([]*rmpb.ResourceGroup, error) diff --git a/pkg/mcs/resourcemanager/server/apis/v1/api.go b/pkg/mcs/resourcemanager/server/apis/v1/api.go index 7c5e3e010dc..970880788d4 100644 --- a/pkg/mcs/resourcemanager/server/apis/v1/api.go +++ b/pkg/mcs/resourcemanager/server/apis/v1/api.go @@ -16,7 +16,9 @@ package apis import ( "errors" + "fmt" "net/http" + "reflect" "sync" "github.com/gin-contrib/cors" @@ -29,6 +31,7 @@ import ( "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/utils/apiutil" "github.com/tikv/pd/pkg/utils/apiutil/multiservicesapi" + "github.com/tikv/pd/pkg/utils/reflectutil" ) // APIPathPrefix is the prefix of the API path. @@ -97,6 +100,8 @@ func (s *Service) RegisterRouter() { configEndpoint.GET("/group/:name", s.getResourceGroup) configEndpoint.GET("/groups", s.getResourceGroupList) configEndpoint.DELETE("/group/:name", s.deleteResourceGroup) + configEndpoint.GET("/controller", s.getControllerConfig) + configEndpoint.POST("/controller", s.setControllerConfig) } func (s *Service) handler() http.Handler { @@ -191,3 +196,43 @@ func (s *Service) deleteResourceGroup(c *gin.Context) { } c.String(http.StatusOK, "Success!") } + +// GetControllerConfig +// +// @Tags ResourceManager +// @Summary Get the resource controller config. 
+// @Success 200 {string} json format of rmserver.ControllerConfig +// @Failure 400 {string} error +// @Router /config/controller [GET] +func (s *Service) getControllerConfig(c *gin.Context) { + config := s.manager.GetControllerConfig() + c.IndentedJSON(http.StatusOK, config) +} + +// SetControllerConfig +// +// @Tags ResourceManager +// @Summary Set the resource controller config. +// @Param config body object true "json params, rmserver.ControllerConfig" +// @Success 200 {string} string "Success!" +// @Failure 400 {string} error +// @Router /config/controller [POST] +func (s *Service) setControllerConfig(c *gin.Context) { + conf := make(map[string]interface{}) + if err := c.ShouldBindJSON(&conf); err != nil { + c.String(http.StatusBadRequest, err.Error()) + return + } + for k, v := range conf { + key := reflectutil.FindJSONFullTagByChildTag(reflect.TypeOf(rmserver.ControllerConfig{}), k) + if key == "" { + c.String(http.StatusBadRequest, fmt.Sprintf("config item %s not found", k)) + return + } + if err := s.manager.UpdateControllerConfigItem(key, v); err != nil { + c.String(http.StatusBadRequest, err.Error()) + return + } + } + c.String(http.StatusOK, "Success!") +} diff --git a/pkg/mcs/resourcemanager/server/config.go b/pkg/mcs/resourcemanager/server/config.go index 51fbe388458..3f64b2987fd 100644 --- a/pkg/mcs/resourcemanager/server/config.go +++ b/pkg/mcs/resourcemanager/server/config.go @@ -57,6 +57,8 @@ const ( // Because the resource manager has not been deployed in microservice mode, // do not enable this function. defaultDegradedModeWaitDuration = time.Second * 0 + // defaultMaxWaitDuration is the max duration to wait for the token before throwing error. + defaultMaxWaitDuration = 30 * time.Second ) // Config is the configuration for the resource manager. @@ -94,6 +96,9 @@ type ControllerConfig struct { // EnableDegradedMode is to control whether resource control client enable degraded mode when server is disconnect. DegradedModeWaitDuration typeutil.Duration `toml:"degraded-mode-wait-duration" json:"degraded-mode-wait-duration"` + // LTBMaxWaitDuration is the max wait time duration for local token bucket. + LTBMaxWaitDuration typeutil.Duration `toml:"ltb-max-wait-duration" json:"ltb-max-wait-duration"` + // RequestUnit is the configuration determines the coefficients of the RRU and WRU cost. // This configuration should be modified carefully. 
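The controller-config endpoints added above (GET/POST `/config/controller`) are exercised by the integration test at the end of this patch; as a hedged sketch of driving them from Go (the URL prefix and JSON keys come from that test, everything else is illustrative), the POST body is a flat JSON object, and a child key of `request-unit` such as `read-base-cost` is resolved to its full `request-unit.read-base-cost` tag path via `reflectutil.FindJSONFullTagByChildTag` before being applied.

    import (
        "net/http"
        "strings"
    )

    func bumpControllerConfig(addr string) error {
        // Durations are posted as strings, RU coefficients as plain numbers.
        body := strings.NewReader(`{"ltb-max-wait-duration": "15s", "read-base-cost": 0.25}`)
        resp, err := http.Post(addr+"/resource-manager/api/v1/config/controller",
            "application/json", body)
        if err != nil {
            return err
        }
        defer resp.Body.Close()
        // The handler replies "Success!" with HTTP 200 once every item has been applied.
        return nil
    }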
RequestUnit RequestUnitConfig `toml:"request-unit" json:"request-unit"` @@ -107,6 +112,7 @@ func (rmc *ControllerConfig) Adjust(meta *configutil.ConfigMetaData) { rmc.RequestUnit.Adjust() configutil.AdjustDuration(&rmc.DegradedModeWaitDuration, defaultDegradedModeWaitDuration) + configutil.AdjustDuration(&rmc.LTBMaxWaitDuration, defaultMaxWaitDuration) failpoint.Inject("enableDegradedMode", func() { configutil.AdjustDuration(&rmc.DegradedModeWaitDuration, time.Second) }) diff --git a/pkg/mcs/resourcemanager/server/config_test.go b/pkg/mcs/resourcemanager/server/config_test.go index c0cac4da9c0..dd8dd2d2814 100644 --- a/pkg/mcs/resourcemanager/server/config_test.go +++ b/pkg/mcs/resourcemanager/server/config_test.go @@ -27,6 +27,7 @@ func TestControllerConfig(t *testing.T) { re := require.New(t) cfgData := ` [controller] +ltb-max-wait-duration = "60s" degraded-mode-wait-duration = "2s" [controller.request-unit] read-base-cost = 1.0 @@ -42,6 +43,7 @@ read-cpu-ms-cost = 5.0 re.NoError(err) re.Equal(cfg.Controller.DegradedModeWaitDuration.Duration, time.Second*2) + re.Equal(cfg.Controller.LTBMaxWaitDuration.Duration, time.Second*60) re.LessOrEqual(math.Abs(cfg.Controller.RequestUnit.CPUMsCost-5), 1e-7) re.LessOrEqual(math.Abs(cfg.Controller.RequestUnit.WriteCostPerByte-4), 1e-7) re.LessOrEqual(math.Abs(cfg.Controller.RequestUnit.WriteBaseCost-3), 1e-7) diff --git a/pkg/mcs/resourcemanager/server/manager.go b/pkg/mcs/resourcemanager/server/manager.go index 6d1b872575b..21866ee1156 100644 --- a/pkg/mcs/resourcemanager/server/manager.go +++ b/pkg/mcs/resourcemanager/server/manager.go @@ -19,10 +19,12 @@ import ( "encoding/json" "math" "sort" + "strings" "sync" "time" "github.com/gogo/protobuf/proto" + "github.com/pingcap/errors" "github.com/pingcap/failpoint" rmpb "github.com/pingcap/kvproto/pkg/resource_manager" "github.com/pingcap/log" @@ -30,6 +32,7 @@ import ( "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/storage/endpoint" "github.com/tikv/pd/pkg/storage/kv" + "github.com/tikv/pd/pkg/utils/jsonutil" "github.com/tikv/pd/pkg/utils/logutil" "go.uber.org/zap" ) @@ -102,32 +105,46 @@ func (m *Manager) GetBasicServer() bs.Server { // Init initializes the resource group manager. func (m *Manager) Init(ctx context.Context) error { - // Todo: If we can modify following configs in the future, we should reload these configs. - // Store the controller config into the storage. - m.storage.SaveControllerConfig(m.controllerConfig) + v, err := m.storage.LoadControllerConfig() + if err != nil { + log.Error("resource controller config load failed", zap.Error(err), zap.String("v", v)) + return err + } + if err = json.Unmarshal([]byte(v), &m.controllerConfig); err != nil { + log.Error("un-marshall controller config failed, fallback to default", zap.Error(err), zap.String("v", v)) + } + + // re-save the config to make sure the config has been persisted. + if err := m.storage.SaveControllerConfig(m.controllerConfig); err != nil { + return err + } // Load resource group meta info from storage. 
m.groups = make(map[string]*ResourceGroup) handler := func(k, v string) { group := &rmpb.ResourceGroup{} if err := proto.Unmarshal([]byte(v), group); err != nil { - log.Error("err", zap.Error(err), zap.String("k", k), zap.String("v", v)) + log.Error("failed to parse the resource group", zap.Error(err), zap.String("k", k), zap.String("v", v)) panic(err) } m.groups[group.Name] = FromProtoResourceGroup(group) } - m.storage.LoadResourceGroupSettings(handler) + if err := m.storage.LoadResourceGroupSettings(handler); err != nil { + return err + } // Load resource group states from storage. tokenHandler := func(k, v string) { tokens := &GroupStates{} if err := json.Unmarshal([]byte(v), tokens); err != nil { - log.Error("err", zap.Error(err), zap.String("k", k), zap.String("v", v)) + log.Error("failed to parse the resource group state", zap.Error(err), zap.String("k", k), zap.String("v", v)) panic(err) } if group, ok := m.groups[k]; ok { group.SetStatesIntoResourceGroup(tokens) } } - m.storage.LoadResourceGroupStates(tokenHandler) + if err := m.storage.LoadResourceGroupStates(tokenHandler); err != nil { + return err + } // Add default group if it's not inited. if _, ok := m.groups[reservedDefaultGroupName]; !ok { @@ -159,6 +176,47 @@ func (m *Manager) Init(ctx context.Context) error { return nil } +// UpdateControllerConfigItem updates the controller config item. +func (m *Manager) UpdateControllerConfigItem(key string, value interface{}) error { + kp := strings.Split(key, ".") + if len(kp) == 0 { + return errors.Errorf("invalid key %s", key) + } + m.Lock() + var config interface{} + switch kp[0] { + case "request-unit": + config = &m.controllerConfig.RequestUnit + default: + config = m.controllerConfig + } + updated, found, err := jsonutil.AddKeyValue(config, kp[len(kp)-1], value) + if err != nil { + m.Unlock() + return err + } + + if !found { + m.Unlock() + return errors.Errorf("config item %s not found", key) + } + m.Unlock() + if updated { + if err := m.storage.SaveControllerConfig(m.controllerConfig); err != nil { + log.Error("save controller config failed", zap.Error(err)) + } + log.Info("updated controller config item", zap.String("key", key), zap.Any("value", value)) + } + return nil +} + +// GetControllerConfig returns the controller config. +func (m *Manager) GetControllerConfig() *ControllerConfig { + m.RLock() + defer m.RUnlock() + return m.controllerConfig +} + // AddResourceGroup puts a resource group. // NOTE: AddResourceGroup should also be idempotent because tidb depends // on this retry mechanism. diff --git a/pkg/storage/endpoint/resource_group.go b/pkg/storage/endpoint/resource_group.go index f1b3feb36aa..150ea77a1c7 100644 --- a/pkg/storage/endpoint/resource_group.go +++ b/pkg/storage/endpoint/resource_group.go @@ -27,6 +27,7 @@ type ResourceGroupStorage interface { SaveResourceGroupStates(name string, obj interface{}) error DeleteResourceGroupStates(name string) error SaveControllerConfig(config interface{}) error + LoadControllerConfig() (string, error) } var _ ResourceGroupStorage = (*StorageEndpoint)(nil) @@ -65,3 +66,8 @@ func (se *StorageEndpoint) LoadResourceGroupStates(f func(k, v string)) error { func (se *StorageEndpoint) SaveControllerConfig(config interface{}) error { return se.saveJSON(controllerConfigPath, config) } + +// LoadControllerConfig loads the resource controller config from storage. 
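A small sketch of the update path wired up in the manager above (illustrative; `m` is assumed to be the *Manager): the API layer hands over the full JSON tag path, the matching section of ControllerConfig is mutated through `jsonutil.AddKeyValue`, and the whole config is re-persisted so that clients watching the controller config key observe the change.

    func applyReadBaseCost(m *Manager) error {
        // "request-unit.read-base-cost" is the full tag path the API layer derives
        // from the POSTed field name "read-base-cost".
        if err := m.UpdateControllerConfigItem("request-unit.read-base-cost", 0.25); err != nil {
            return err
        }
        // GetControllerConfig takes the manager's read lock before returning the config.
        _ = m.GetControllerConfig().LTBMaxWaitDuration
        return nil
    }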
+func (se *StorageEndpoint) LoadControllerConfig() (string, error) { + return se.Load(controllerConfigPath) +} diff --git a/server/server.go b/server/server.go index 7c19d8ff7c5..9e72477368d 100644 --- a/server/server.go +++ b/server/server.go @@ -1709,7 +1709,10 @@ func (s *Server) campaignLeader() { log.Info("triggering the leader callback functions") for _, cb := range s.leaderCallbacks { - cb(ctx) + if err := cb(ctx); err != nil { + log.Error("failed to execute leader callback function", errs.ZapError(err)) + return + } } // Try to create raft cluster. diff --git a/tests/integrations/mcs/resourcemanager/resource_manager_test.go b/tests/integrations/mcs/resourcemanager/resource_manager_test.go index 926484cea1e..546339bee0f 100644 --- a/tests/integrations/mcs/resourcemanager/resource_manager_test.go +++ b/tests/integrations/mcs/resourcemanager/resource_manager_test.go @@ -19,6 +19,7 @@ import ( "encoding/json" "fmt" "io" + "math/rand" "net/http" "strconv" "strings" @@ -1265,3 +1266,106 @@ func (suite *resourceManagerClientTestSuite) TestSkipConsumptionForBackgroundJob c.Stop() } + +func (suite *resourceManagerClientTestSuite) TestResourceGroupControllerConfigChanged() { + re := suite.Require() + cli := suite.client + for _, group := range suite.initGroups { + resp, err := cli.AddResourceGroup(suite.ctx, group) + re.NoError(err) + re.Contains(resp, "Success!") + } + c1, err := controller.NewResourceGroupController(suite.ctx, 1, cli, nil) + re.NoError(err) + c1.Start(suite.ctx) + // with client option + c2, err := controller.NewResourceGroupController(suite.ctx, 2, cli, nil, controller.WithMaxWaitDuration(time.Hour)) + re.NoError(err) + c2.Start(suite.ctx) + // helper function for sending HTTP requests and checking responses + sendRequest := func(method, url string, body io.Reader) []byte { + req, err := http.NewRequest(method, url, body) + re.NoError(err) + resp, err := http.DefaultClient.Do(req) + re.NoError(err) + defer resp.Body.Close() + bytes, err := io.ReadAll(resp.Body) + re.NoError(err) + if resp.StatusCode != http.StatusOK { + re.Fail(string(bytes)) + } + return bytes + } + + getAddr := func() string { + server := suite.cluster.GetServer(suite.cluster.GetLeader()) + if rand.Intn(100)%2 == 1 { + server = suite.cluster.GetServer(suite.cluster.GetFollower()) + } + return server.GetAddr() + } + + configURL := "/resource-manager/api/v1/config/controller" + waitDuration := 10 * time.Second + readBaseCost := 1.5 + defaultCfg := controller.DefaultConfig() + // failpoint enableDegradedMode will setup and set it be 1s. 
+ defaultCfg.DegradedModeWaitDuration.Duration = time.Second + expectRUCfg := controller.GenerateRUConfig(defaultCfg) + // initial config verification + respString := sendRequest("GET", getAddr()+configURL, nil) + defaultString, err := json.Marshal(defaultCfg) + re.NoError(err) + re.JSONEq(string(respString), string(defaultString)) + re.EqualValues(expectRUCfg, c1.GetConfig()) + + testCases := []struct { + configJSON string + value interface{} + expected func(ruConfig *controller.RUConfig) + }{ + { + configJSON: fmt.Sprintf(`{"degraded-mode-wait-duration": "%v"}`, waitDuration), + value: waitDuration, + expected: func(ruConfig *controller.RUConfig) { ruConfig.DegradedModeWaitDuration = waitDuration }, + }, + { + configJSON: fmt.Sprintf(`{"ltb-max-wait-duration": "%v"}`, waitDuration), + value: waitDuration, + expected: func(ruConfig *controller.RUConfig) { ruConfig.LTBMaxWaitDuration = waitDuration }, + }, + { + configJSON: fmt.Sprintf(`{"read-base-cost": %v}`, readBaseCost), + value: readBaseCost, + expected: func(ruConfig *controller.RUConfig) { ruConfig.ReadBaseCost = controller.RequestUnit(readBaseCost) }, + }, + { + configJSON: fmt.Sprintf(`{"write-base-cost": %v}`, readBaseCost*2), + value: readBaseCost * 2, + expected: func(ruConfig *controller.RUConfig) { ruConfig.WriteBaseCost = controller.RequestUnit(readBaseCost * 2) }, + }, + { + // reset the degraded-mode-wait-duration to default in test. + configJSON: fmt.Sprintf(`{"degraded-mode-wait-duration": "%v"}`, time.Second), + value: time.Second, + expected: func(ruConfig *controller.RUConfig) { ruConfig.DegradedModeWaitDuration = time.Second }, + }, + } + // change properties one by one and verify each time + for _, t := range testCases { + sendRequest("POST", getAddr()+configURL, strings.NewReader(t.configJSON)) + time.Sleep(500 * time.Millisecond) + t.expected(expectRUCfg) + re.EqualValues(expectRUCfg, c1.GetConfig()) + + expectRUCfg2 := *expectRUCfg + // always apply the client option + expectRUCfg2.LTBMaxWaitDuration = time.Hour + re.EqualValues(&expectRUCfg2, c2.GetConfig()) + } + // restart c1 + c1.Stop() + c1, err = controller.NewResourceGroupController(suite.ctx, 1, cli, nil) + re.NoError(err) + re.EqualValues(expectRUCfg, c1.GetConfig()) +} From 559ea7f247118eec9290d37793a217b116e73ddb Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Tue, 19 Sep 2023 05:45:10 +0800 Subject: [PATCH 02/14] config: enable store limit v2 in raftstore-v2 (#7098) close tikv/pd#7099 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- server/cluster/cluster.go | 2 +- server/cluster/cluster_test.go | 4 ++++ server/config/persist_options.go | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 29a8709bdac..1b2bc75e334 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -436,7 +436,7 @@ func (c *RaftCluster) runStoreConfigSync() { for { synced, switchRaftV2Config = c.syncStoreConfig(stores) if switchRaftV2Config { - if err := c.opt.Persist(c.GetStorage()); err != nil { + if err := c.opt.SwitchRaftV2(c.GetStorage()); err != nil { log.Warn("store config persisted failed", zap.Error(err)) } } diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index ea8d27b155f..5679fd6128d 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -1469,6 +1469,10 @@ func TestStoreConfigSync(t *testing.T) { err = opt.Reload(tc.GetStorage()) re.NoError(err) 
re.Equal(tc.GetOpts().(*config.PersistOptions).GetStoreConfig(), opt.GetStoreConfig()) + + re.Equal("v1", opt.GetScheduleConfig().StoreLimitVersion) + re.NoError(opt.SwitchRaftV2(tc.GetStorage())) + re.Equal("v2", opt.GetScheduleConfig().StoreLimitVersion) } func TestUpdateStorePendingPeerCount(t *testing.T) { diff --git a/server/config/persist_options.go b/server/config/persist_options.go index 1ea0b79424f..3f1c4d4a24e 100644 --- a/server/config/persist_options.go +++ b/server/config/persist_options.go @@ -762,6 +762,12 @@ type persistedConfig struct { StoreConfig sc.StoreConfig `json:"store"` } +// SwitchRaftV2 update some config if tikv raft engine switch into partition raft v2 +func (o *PersistOptions) SwitchRaftV2(storage endpoint.ConfigStorage) error { + o.GetScheduleConfig().StoreLimitVersion = "v2" + return o.Persist(storage) +} + // Persist saves the configuration to the storage. func (o *PersistOptions) Persist(storage endpoint.ConfigStorage) error { cfg := &persistedConfig{ From dff99fd4a1b1569f87e775ba0764b4662cc50a6d Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Tue, 19 Sep 2023 14:31:42 +0800 Subject: [PATCH 03/14] *: decouple operator handler and server (#7089) ref tikv/pd#5839 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- errors.toml | 30 ++ pkg/errs/errno.go | 24 ++ pkg/schedule/handler/handler.go | 500 +++++++++++++++++++++++++++++ server/api/hot_status.go | 5 +- server/api/label.go | 3 +- server/api/region.go | 7 +- server/api/rule.go | 2 +- server/api/store.go | 6 +- server/cluster/cluster.go | 4 +- server/grpc_service.go | 4 +- server/handler.go | 545 ++------------------------------ 11 files changed, 600 insertions(+), 530 deletions(-) create mode 100644 pkg/schedule/handler/handler.go diff --git a/errors.toml b/errors.toml index a0040195e5d..6766da79572 100644 --- a/errors.toml +++ b/errors.toml @@ -516,6 +516,36 @@ error = ''' TCP socks error ''' +["PD:operator:ErrAddOperator"] +error = ''' +failed to add operator, maybe already have one +''' + +["PD:operator:ErrOperatorNotFound"] +error = ''' +operator not found +''' + +["PD:operator:ErrPluginNotFound"] +error = ''' +plugin is not found: %s +''' + +["PD:operator:ErrRegionAbnormalPeer"] +error = ''' +region %v has abnormal peer +''' + +["PD:operator:ErrRegionNotAdjacent"] +error = ''' +two regions are not adjacent +''' + +["PD:operator:ErrRegionNotFound"] +error = ''' +region %v not found +''' + ["PD:os:ErrOSOpen"] error = ''' open error diff --git a/pkg/errs/errno.go b/pkg/errs/errno.go index a077751f561..9eedb144f95 100644 --- a/pkg/errs/errno.go +++ b/pkg/errs/errno.go @@ -92,6 +92,30 @@ var ( ErrClientGetMinTSO = errors.Normalize("get min TSO failed, %v", errors.RFCCodeText("PD:client:ErrClientGetMinTSO")) ) +// operator errors +var ( + // ErrOperatorNotFound is error info for operator not found. + ErrOperatorNotFound = errors.Normalize("operator not found", errors.RFCCodeText("PD:operator:ErrOperatorNotFound")) + // ErrAddOperator is error info for already have an operator when adding operator. + ErrAddOperator = errors.Normalize("failed to add operator, maybe already have one", errors.RFCCodeText("PD:operator:ErrAddOperator")) +) + +// region errors +var ( + // ErrRegionNotAdjacent is error info for region not adjacent. + ErrRegionNotAdjacent = errors.Normalize("two regions are not adjacent", errors.RFCCodeText("PD:operator:ErrRegionNotAdjacent")) + // ErrRegionNotFound is error info for region not found. 
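The error definitions that follow pair with the errors.toml entries added above; as a brief illustrative note (assuming the usual pingcap/errors Normalize behavior), the handler code later in this patch raises them with FastGenByArgs, which fills the registered placeholder:

    func regionNotFound(regionID uint64) error {
        // Fills the registered "%v" placeholder, yielding "region 42 not found" for
        // regionID == 42, tagged with the RFC code PD:operator:ErrRegionNotFound.
        return errs.ErrRegionNotFound.FastGenByArgs(regionID)
    }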
+ ErrRegionNotFound = errors.Normalize("region %v not found", errors.RFCCodeText("PD:operator:ErrRegionNotFound")) + // ErrRegionAbnormalPeer is error info for region has abnormal peer. + ErrRegionAbnormalPeer = errors.Normalize("region %v has abnormal peer", errors.RFCCodeText("PD:operator:ErrRegionAbnormalPeer")) +) + +// plugin errors +var ( + // ErrPluginNotFound is error info for plugin not found. + ErrPluginNotFound = errors.Normalize("plugin is not found: %s", errors.RFCCodeText("PD:operator:ErrPluginNotFound")) +) + // schedule errors var ( ErrUnexpectedOperatorStatus = errors.Normalize("operator with unexpected status", errors.RFCCodeText("PD:schedule:ErrUnexpectedOperatorStatus")) diff --git a/pkg/schedule/handler/handler.go b/pkg/schedule/handler/handler.go new file mode 100644 index 00000000000..d48941726d0 --- /dev/null +++ b/pkg/schedule/handler/handler.go @@ -0,0 +1,500 @@ +// Copyright 2023 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package handler + +import ( + "bytes" + "encoding/hex" + "strings" + "time" + + "github.com/pingcap/errors" + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/kvproto/pkg/pdpb" + "github.com/pingcap/log" + "github.com/tikv/pd/pkg/core" + "github.com/tikv/pd/pkg/errs" + "github.com/tikv/pd/pkg/schedule" + sche "github.com/tikv/pd/pkg/schedule/core" + "github.com/tikv/pd/pkg/schedule/filter" + "github.com/tikv/pd/pkg/schedule/operator" + "github.com/tikv/pd/pkg/schedule/placement" + "github.com/tikv/pd/pkg/schedule/scatter" +) + +// Server is the interface for handler about schedule. +// TODO: remove it after GetCluster is unified between PD server and Scheduling server. +type Server interface { + GetCoordinator() *schedule.Coordinator + GetCluster() sche.SharedCluster +} + +// Handler is a handler to handle http request about schedule. +type Handler struct { + Server +} + +// NewHandler creates a new handler. +func NewHandler(server Server) *Handler { + return &Handler{ + Server: server, + } +} + +// GetOperatorController returns OperatorController. +func (h *Handler) GetOperatorController() (*operator.Controller, error) { + co := h.GetCoordinator() + if co == nil { + return nil, errs.ErrNotBootstrapped.GenWithStackByArgs() + } + return co.GetOperatorController(), nil +} + +// GetRegionScatterer returns RegionScatterer. +func (h *Handler) GetRegionScatterer() (*scatter.RegionScatterer, error) { + co := h.GetCoordinator() + if co == nil { + return nil, errs.ErrNotBootstrapped.GenWithStackByArgs() + } + return co.GetRegionScatterer(), nil +} + +// GetOperator returns the region operator. +func (h *Handler) GetOperator(regionID uint64) (*operator.Operator, error) { + c, err := h.GetOperatorController() + if err != nil { + return nil, err + } + + op := c.GetOperator(regionID) + if op == nil { + return nil, errs.ErrOperatorNotFound + } + + return op, nil +} + +// GetOperatorStatus returns the status of the region operator. 
+func (h *Handler) GetOperatorStatus(regionID uint64) (*operator.OpWithStatus, error) { + c, err := h.GetOperatorController() + if err != nil { + return nil, err + } + + op := c.GetOperatorStatus(regionID) + if op == nil { + return nil, errs.ErrOperatorNotFound + } + + return op, nil +} + +// RemoveOperator removes the region operator. +func (h *Handler) RemoveOperator(regionID uint64) error { + c, err := h.GetOperatorController() + if err != nil { + return err + } + + op := c.GetOperator(regionID) + if op == nil { + return errs.ErrOperatorNotFound + } + + _ = c.RemoveOperator(op, operator.AdminStop) + return nil +} + +// GetOperators returns the running operators. +func (h *Handler) GetOperators() ([]*operator.Operator, error) { + c, err := h.GetOperatorController() + if err != nil { + return nil, err + } + return c.GetOperators(), nil +} + +// GetWaitingOperators returns the waiting operators. +func (h *Handler) GetWaitingOperators() ([]*operator.Operator, error) { + c, err := h.GetOperatorController() + if err != nil { + return nil, err + } + return c.GetWaitingOperators(), nil +} + +// GetAdminOperators returns the running admin operators. +func (h *Handler) GetAdminOperators() ([]*operator.Operator, error) { + c, err := h.GetOperatorController() + if err != nil { + return nil, err + } + return c.GetOperatorsOfKind(operator.OpAdmin), nil +} + +// GetLeaderOperators returns the running leader operators. +func (h *Handler) GetLeaderOperators() ([]*operator.Operator, error) { + c, err := h.GetOperatorController() + if err != nil { + return nil, err + } + return c.GetOperatorsOfKind(operator.OpLeader), nil +} + +// GetRegionOperators returns the running region operators. +func (h *Handler) GetRegionOperators() ([]*operator.Operator, error) { + c, err := h.GetOperatorController() + if err != nil { + return nil, err + } + return c.GetOperatorsOfKind(operator.OpRegion), nil +} + +// GetHistory returns finished operators' history since start. +func (h *Handler) GetHistory(start time.Time) ([]operator.OpHistory, error) { + c, err := h.GetOperatorController() + if err != nil { + return nil, err + } + return c.GetHistory(start), nil +} + +// GetRecords returns finished operators since start. +func (h *Handler) GetRecords(from time.Time) ([]*operator.OpRecord, error) { + c, err := h.GetOperatorController() + if err != nil { + return nil, err + } + records := c.GetRecords(from) + if len(records) == 0 { + return nil, errs.ErrOperatorNotFound + } + return records, nil +} + +// AddTransferLeaderOperator adds an operator to transfer leader to the store. +func (h *Handler) AddTransferLeaderOperator(regionID uint64, storeID uint64) error { + c := h.GetCluster() + region := c.GetRegion(regionID) + if region == nil { + return errs.ErrRegionNotFound.FastGenByArgs(regionID) + } + + newLeader := region.GetStoreVoter(storeID) + if newLeader == nil { + return errors.Errorf("region has no voter in store %v", storeID) + } + + op, err := operator.CreateTransferLeaderOperator("admin-transfer-leader", c, region, region.GetLeader().GetStoreId(), newLeader.GetStoreId(), []uint64{}, operator.OpAdmin) + if err != nil { + log.Debug("fail to create transfer leader operator", errs.ZapError(err)) + return err + } + return h.addOperator(op) +} + +// AddTransferRegionOperator adds an operator to transfer region to the stores. 
+func (h *Handler) AddTransferRegionOperator(regionID uint64, storeIDs map[uint64]placement.PeerRoleType) error { + c := h.GetCluster() + region := c.GetRegion(regionID) + if region == nil { + return errs.ErrRegionNotFound.FastGenByArgs(regionID) + } + + if c.GetSharedConfig().IsPlacementRulesEnabled() { + // Cannot determine role without peer role when placement rules enabled. Not supported now. + for _, role := range storeIDs { + if len(role) == 0 { + return errors.New("transfer region without peer role is not supported when placement rules enabled") + } + } + } + for id := range storeIDs { + if err := checkStoreState(c, id); err != nil { + return err + } + } + + roles := make(map[uint64]placement.PeerRoleType) + for id, peerRole := range storeIDs { + if peerRole == "" { + peerRole = placement.Voter + } + roles[id] = peerRole + } + op, err := operator.CreateMoveRegionOperator("admin-move-region", c, region, operator.OpAdmin, roles) + if err != nil { + log.Debug("fail to create move region operator", errs.ZapError(err)) + return err + } + return h.addOperator(op) +} + +// AddTransferPeerOperator adds an operator to transfer peer. +func (h *Handler) AddTransferPeerOperator(regionID uint64, fromStoreID, toStoreID uint64) error { + c := h.GetCluster() + region := c.GetRegion(regionID) + if region == nil { + return errs.ErrRegionNotFound.FastGenByArgs(regionID) + } + + oldPeer := region.GetStorePeer(fromStoreID) + if oldPeer == nil { + return errors.Errorf("region has no peer in store %v", fromStoreID) + } + + if err := checkStoreState(c, toStoreID); err != nil { + return err + } + + newPeer := &metapb.Peer{StoreId: toStoreID, Role: oldPeer.GetRole(), IsWitness: oldPeer.GetIsWitness()} + op, err := operator.CreateMovePeerOperator("admin-move-peer", c, region, operator.OpAdmin, fromStoreID, newPeer) + if err != nil { + log.Debug("fail to create move peer operator", errs.ZapError(err)) + return err + } + return h.addOperator(op) +} + +// checkAdminAddPeerOperator checks adminAddPeer operator with given region ID and store ID. +func (h *Handler) checkAdminAddPeerOperator(regionID uint64, toStoreID uint64) (sche.SharedCluster, *core.RegionInfo, error) { + c := h.GetCluster() + region := c.GetRegion(regionID) + if region == nil { + return nil, nil, errs.ErrRegionNotFound.FastGenByArgs(regionID) + } + + if region.GetStorePeer(toStoreID) != nil { + return nil, nil, errors.Errorf("region already has peer in store %v", toStoreID) + } + + if err := checkStoreState(c, toStoreID); err != nil { + return nil, nil, err + } + + return c, region, nil +} + +// AddAddPeerOperator adds an operator to add peer. +func (h *Handler) AddAddPeerOperator(regionID uint64, toStoreID uint64) error { + c, region, err := h.checkAdminAddPeerOperator(regionID, toStoreID) + if err != nil { + return err + } + + newPeer := &metapb.Peer{StoreId: toStoreID} + op, err := operator.CreateAddPeerOperator("admin-add-peer", c, region, newPeer, operator.OpAdmin) + if err != nil { + log.Debug("fail to create add peer operator", errs.ZapError(err)) + return err + } + return h.addOperator(op) +} + +// AddAddLearnerOperator adds an operator to add learner. 
+func (h *Handler) AddAddLearnerOperator(regionID uint64, toStoreID uint64) error { + c, region, err := h.checkAdminAddPeerOperator(regionID, toStoreID) + if err != nil { + return err + } + + newPeer := &metapb.Peer{ + StoreId: toStoreID, + Role: metapb.PeerRole_Learner, + } + + op, err := operator.CreateAddPeerOperator("admin-add-learner", c, region, newPeer, operator.OpAdmin) + if err != nil { + log.Debug("fail to create add learner operator", errs.ZapError(err)) + return err + } + return h.addOperator(op) +} + +// AddRemovePeerOperator adds an operator to remove peer. +func (h *Handler) AddRemovePeerOperator(regionID uint64, fromStoreID uint64) error { + c := h.GetCluster() + region := c.GetRegion(regionID) + if region == nil { + return errs.ErrRegionNotFound.FastGenByArgs(regionID) + } + + if region.GetStorePeer(fromStoreID) == nil { + return errors.Errorf("region has no peer in store %v", fromStoreID) + } + + op, err := operator.CreateRemovePeerOperator("admin-remove-peer", c, operator.OpAdmin, region, fromStoreID) + if err != nil { + log.Debug("fail to create move peer operator", errs.ZapError(err)) + return err + } + return h.addOperator(op) +} + +// AddMergeRegionOperator adds an operator to merge region. +func (h *Handler) AddMergeRegionOperator(regionID uint64, targetID uint64) error { + c := h.GetCluster() + region := c.GetRegion(regionID) + if region == nil { + return errs.ErrRegionNotFound.FastGenByArgs(regionID) + } + + target := c.GetRegion(targetID) + if target == nil { + return errs.ErrRegionNotFound.FastGenByArgs(targetID) + } + + if !filter.IsRegionHealthy(region) || !filter.IsRegionReplicated(c, region) { + return errs.ErrRegionAbnormalPeer.FastGenByArgs(regionID) + } + + if !filter.IsRegionHealthy(target) || !filter.IsRegionReplicated(c, target) { + return errs.ErrRegionAbnormalPeer.FastGenByArgs(targetID) + } + + // for the case first region (start key is nil) with the last region (end key is nil) but not adjacent + if (!bytes.Equal(region.GetStartKey(), target.GetEndKey()) || len(region.GetStartKey()) == 0) && + (!bytes.Equal(region.GetEndKey(), target.GetStartKey()) || len(region.GetEndKey()) == 0) { + return errs.ErrRegionNotAdjacent + } + + ops, err := operator.CreateMergeRegionOperator("admin-merge-region", c, region, target, operator.OpAdmin) + if err != nil { + log.Debug("fail to create merge region operator", errs.ZapError(err)) + return err + } + return h.addOperator(ops...) +} + +// AddSplitRegionOperator adds an operator to split a region. +func (h *Handler) AddSplitRegionOperator(regionID uint64, policyStr string, keys []string) error { + c := h.GetCluster() + region := c.GetRegion(regionID) + if region == nil { + return errs.ErrRegionNotFound.FastGenByArgs(regionID) + } + + policy, ok := pdpb.CheckPolicy_value[strings.ToUpper(policyStr)] + if !ok { + return errors.Errorf("check policy %s is not supported", policyStr) + } + + var splitKeys [][]byte + if pdpb.CheckPolicy(policy) == pdpb.CheckPolicy_USEKEY { + for i := range keys { + k, err := hex.DecodeString(keys[i]) + if err != nil { + return errors.Errorf("split key %s is not in hex format", keys[i]) + } + splitKeys = append(splitKeys, k) + } + } + + op, err := operator.CreateSplitRegionOperator("admin-split-region", region, operator.OpAdmin, pdpb.CheckPolicy(policy), splitKeys) + if err != nil { + return err + } + + return h.addOperator(op) +} + +// AddScatterRegionOperator adds an operator to scatter a region. 
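Taken together, the helpers above (and the scatter helpers that follow) let operator administration be exposed without depending on the PD server package: anything that can return a *schedule.Coordinator and a sche.SharedCluster satisfies the Server interface. A hedged usage sketch, where `srv` is an assumed implementation rather than something defined in this patch:

    import "github.com/tikv/pd/pkg/schedule/handler"

    func transferPeer(srv handler.Server, regionID, fromStoreID, toStoreID uint64) error {
        h := handler.NewHandler(srv)
        // Fails with errs.ErrRegionNotFound / ErrStoreNotFound / ErrAddOperator and
        // friends, so API layers can map them to HTTP statuses uniformly.
        return h.AddTransferPeerOperator(regionID, fromStoreID, toStoreID)
    }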
+func (h *Handler) AddScatterRegionOperator(regionID uint64, group string) error { + c := h.GetCluster() + region := c.GetRegion(regionID) + if region == nil { + return errs.ErrRegionNotFound.FastGenByArgs(regionID) + } + + if c.IsRegionHot(region) { + return errors.Errorf("region %d is a hot region", regionID) + } + + s, err := h.GetRegionScatterer() + if err != nil { + return err + } + + op, err := s.Scatter(region, group, false) + if err != nil { + return err + } + + if op == nil { + return nil + } + return h.addOperator(op) +} + +// AddScatterRegionsOperators add operators to scatter regions and return the processed percentage and error +func (h *Handler) AddScatterRegionsOperators(regionIDs []uint64, startRawKey, endRawKey, group string, retryLimit int) (int, error) { + s, err := h.GetRegionScatterer() + if err != nil { + return 0, err + } + opsCount := 0 + var failures map[uint64]error + // If startKey and endKey are both defined, use them first. + if len(startRawKey) > 0 && len(endRawKey) > 0 { + startKey, err := hex.DecodeString(startRawKey) + if err != nil { + return 0, err + } + endKey, err := hex.DecodeString(endRawKey) + if err != nil { + return 0, err + } + opsCount, failures, err = s.ScatterRegionsByRange(startKey, endKey, group, retryLimit) + if err != nil { + return 0, err + } + } else { + opsCount, failures, err = s.ScatterRegionsByID(regionIDs, group, retryLimit, false) + if err != nil { + return 0, err + } + } + percentage := 100 + if len(failures) > 0 { + percentage = 100 - 100*len(failures)/(opsCount+len(failures)) + } + return percentage, nil +} + +func (h *Handler) addOperator(ops ...*operator.Operator) error { + oc, err := h.GetOperatorController() + if err != nil { + return err + } + + if ok := oc.AddOperator(ops...); !ok { + return errors.WithStack(errs.ErrAddOperator) + } + return nil +} + +func checkStoreState(c sche.SharedCluster, storeID uint64) error { + store := c.GetStore(storeID) + if store == nil { + return errs.ErrStoreNotFound.FastGenByArgs(storeID) + } + if store.IsRemoved() { + return errs.ErrStoreRemoved.FastGenByArgs(storeID) + } + if store.IsUnhealthy() { + return errs.ErrStoreUnhealthy.FastGenByArgs(storeID) + } + return nil +} diff --git a/server/api/hot_status.go b/server/api/hot_status.go index 4f64f1bebc5..7779591de1f 100644 --- a/server/api/hot_status.go +++ b/server/api/hot_status.go @@ -22,6 +22,7 @@ import ( "strconv" "github.com/tikv/pd/pkg/core" + "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/statistics/buckets" "github.com/tikv/pd/pkg/statistics/utils" "github.com/tikv/pd/pkg/storage" @@ -117,7 +118,7 @@ func (h *hotStatusHandler) GetHotWriteRegions(w http.ResponseWriter, r *http.Req } store := rc.GetStore(id) if store == nil { - h.rd.JSON(w, http.StatusNotFound, server.ErrStoreNotFound(id).Error()) + h.rd.JSON(w, http.StatusNotFound, errs.ErrStoreNotFound.FastGenByArgs(id).Error()) return } ids = append(ids, id) @@ -153,7 +154,7 @@ func (h *hotStatusHandler) GetHotReadRegions(w http.ResponseWriter, r *http.Requ } store := rc.GetStore(id) if store == nil { - h.rd.JSON(w, http.StatusNotFound, server.ErrStoreNotFound(id).Error()) + h.rd.JSON(w, http.StatusNotFound, errs.ErrStoreNotFound.FastGenByArgs(id).Error()) return } ids = append(ids, id) diff --git a/server/api/label.go b/server/api/label.go index abaad02a4e3..b7f279d86cc 100644 --- a/server/api/label.go +++ b/server/api/label.go @@ -21,6 +21,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" + "github.com/tikv/pd/pkg/errs" 
"github.com/tikv/pd/server" "github.com/unrolled/render" ) @@ -87,7 +88,7 @@ func (h *labelsHandler) GetStoresByLabel(w http.ResponseWriter, r *http.Request) storeID := s.GetId() store := rc.GetStore(storeID) if store == nil { - h.rd.JSON(w, http.StatusInternalServerError, server.ErrStoreNotFound(storeID).Error()) + h.rd.JSON(w, http.StatusInternalServerError, errs.ErrStoreNotFound.FastGenByArgs(storeID).Error()) return } diff --git a/server/api/region.go b/server/api/region.go index 1c21af53296..42b430974c4 100644 --- a/server/api/region.go +++ b/server/api/region.go @@ -33,6 +33,7 @@ import ( "github.com/pingcap/kvproto/pkg/replication_modepb" "github.com/pingcap/log" "github.com/tikv/pd/pkg/core" + "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/keyspace" "github.com/tikv/pd/pkg/schedule/filter" "github.com/tikv/pd/pkg/statistics" @@ -763,7 +764,7 @@ func (h *regionsHandler) GetRegionSiblings(w http.ResponseWriter, r *http.Reques } region := rc.GetRegion(uint64(id)) if region == nil { - h.rd.JSON(w, http.StatusNotFound, server.ErrRegionNotFound(uint64(id)).Error()) + h.rd.JSON(w, http.StatusNotFound, errs.ErrRegionNotFound.FastGenByArgs(uint64(id)).Error()) return } @@ -1037,7 +1038,7 @@ func (h *regionsHandler) ScatterRegions(w http.ResponseWriter, r *http.Request) h.rd.JSON(w, http.StatusBadRequest, err.Error()) return } - opsCount, failures, err = rc.GetRegionScatter().ScatterRegionsByRange(startKey, endKey, group, retryLimit) + opsCount, failures, err = rc.GetRegionScatterer().ScatterRegionsByRange(startKey, endKey, group, retryLimit) if err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return @@ -1048,7 +1049,7 @@ func (h *regionsHandler) ScatterRegions(w http.ResponseWriter, r *http.Request) h.rd.JSON(w, http.StatusBadRequest, "regions_id is invalid") return } - opsCount, failures, err = rc.GetRegionScatter().ScatterRegionsByID(ids, group, retryLimit, false) + opsCount, failures, err = rc.GetRegionScatterer().ScatterRegionsByID(ids, group, retryLimit, false) if err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return diff --git a/server/api/rule.go b/server/api/rule.go index 33c63a8faa2..b3a720ece41 100644 --- a/server/api/rule.go +++ b/server/api/rule.go @@ -167,7 +167,7 @@ func (h *ruleHandler) preCheckForRegionAndRule(w http.ResponseWriter, r *http.Re } region := cluster.GetRegion(regionID) if region == nil { - h.rd.JSON(w, http.StatusNotFound, server.ErrRegionNotFound(regionID).Error()) + h.rd.JSON(w, http.StatusNotFound, errs.ErrRegionNotFound.FastGenByArgs(regionID).Error()) return cluster, nil } return cluster, region diff --git a/server/api/store.go b/server/api/store.go index a3e8c4518a2..7c820a3befa 100644 --- a/server/api/store.go +++ b/server/api/store.go @@ -191,7 +191,7 @@ func (h *storeHandler) GetStore(w http.ResponseWriter, r *http.Request) { store := rc.GetStore(storeID) if store == nil { - h.rd.JSON(w, http.StatusNotFound, server.ErrStoreNotFound(storeID).Error()) + h.rd.JSON(w, http.StatusNotFound, errs.ErrStoreNotFound.FastGenByArgs(storeID).Error()) return } @@ -437,7 +437,7 @@ func (h *storeHandler) SetStoreLimit(w http.ResponseWriter, r *http.Request) { store := rc.GetStore(storeID) if store == nil { - h.rd.JSON(w, http.StatusInternalServerError, server.ErrStoreNotFound(storeID).Error()) + h.rd.JSON(w, http.StatusInternalServerError, errs.ErrStoreNotFound.FastGenByArgs(storeID).Error()) return } @@ -758,7 +758,7 @@ func (h *storesHandler) GetAllStores(w http.ResponseWriter, r *http.Request) { storeID := 
s.GetId() store := rc.GetStore(storeID) if store == nil { - h.rd.JSON(w, http.StatusInternalServerError, server.ErrStoreNotFound(storeID).Error()) + h.rd.JSON(w, http.StatusInternalServerError, errs.ErrStoreNotFound.FastGenByArgs(storeID).Error()) return } diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 1b2bc75e334..94761c330b6 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -759,8 +759,8 @@ func (c *RaftCluster) SetPrepared() { c.coordinator.GetPrepareChecker().SetPrepared() } -// GetRegionScatter returns the region scatter. -func (c *RaftCluster) GetRegionScatter() *scatter.RegionScatterer { +// GetRegionScatterer returns the region scatter. +func (c *RaftCluster) GetRegionScatterer() *scatter.RegionScatterer { return c.coordinator.GetRegionScatterer() } diff --git a/server/grpc_service.go b/server/grpc_service.go index 55b265e32a5..5a483b71818 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -1755,7 +1755,7 @@ func (s *GrpcServer) ScatterRegion(ctx context.Context, request *pdpb.ScatterReg region = core.NewRegionInfo(request.GetRegion(), request.GetLeader()) } - op, err := rc.GetRegionScatter().Scatter(region, request.GetGroup(), request.GetSkipStoreLimit()) + op, err := rc.GetRegionScatterer().Scatter(region, request.GetGroup(), request.GetSkipStoreLimit()) if err != nil { return nil, err } @@ -2152,7 +2152,7 @@ func (s *GrpcServer) SplitAndScatterRegions(ctx context.Context, request *pdpb.S // scatterRegions add operators to scatter regions and return the processed percentage and error func scatterRegions(cluster *cluster.RaftCluster, regionsID []uint64, group string, retryLimit int, skipStoreLimit bool) (int, error) { - opsCount, failures, err := cluster.GetRegionScatter().ScatterRegionsByID(regionsID, group, retryLimit, skipStoreLimit) + opsCount, failures, err := cluster.GetRegionScatterer().ScatterRegionsByID(regionsID, group, retryLimit, skipStoreLimit) if err != nil { return 0, err } diff --git a/server/handler.go b/server/handler.go index a90f8e3f04f..585f362cad8 100644 --- a/server/handler.go +++ b/server/handler.go @@ -15,19 +15,15 @@ package server import ( - "bytes" - "encoding/hex" "encoding/json" "net/http" "net/url" "path" "strconv" - "strings" "time" "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" - "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/log" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/core/storelimit" @@ -35,9 +31,8 @@ import ( "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/schedule" sc "github.com/tikv/pd/pkg/schedule/config" - "github.com/tikv/pd/pkg/schedule/filter" - "github.com/tikv/pd/pkg/schedule/operator" - "github.com/tikv/pd/pkg/schedule/placement" + sche "github.com/tikv/pd/pkg/schedule/core" + "github.com/tikv/pd/pkg/schedule/handler" "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/statistics" "github.com/tikv/pd/pkg/statistics/buckets" @@ -50,36 +45,24 @@ import ( "go.uber.org/zap" ) -var ( - // SchedulerConfigHandlerPath is the api router path of the schedule config handler. - SchedulerConfigHandlerPath = "/api/v1/scheduler-config" - - // ErrOperatorNotFound is error info for operator not found. - ErrOperatorNotFound = errors.New("operator not found") - // ErrAddOperator is error info for already have an operator when adding operator. - ErrAddOperator = errors.New("failed to add operator, maybe already have one") - // ErrRegionNotAdjacent is error info for region not adjacent. 
- ErrRegionNotAdjacent = errors.New("two regions are not adjacent") - // ErrRegionNotFound is error info for region not found. - ErrRegionNotFound = func(regionID uint64) error { - return errors.Errorf("region %v not found", regionID) - } - // ErrRegionAbnormalPeer is error info for region has abnormal peer. - ErrRegionAbnormalPeer = func(regionID uint64) error { - return errors.Errorf("region %v has abnormal peer", regionID) - } - // ErrStoreNotFound is error info for store not found. - ErrStoreNotFound = func(storeID uint64) error { - return errors.Errorf("store %v not found", storeID) - } - // ErrPluginNotFound is error info for plugin not found. - ErrPluginNotFound = func(pluginPath string) error { - return errors.Errorf("plugin is not found: %s", pluginPath) - } -) +// SchedulerConfigHandlerPath is the api router path of the schedule config handler. +var SchedulerConfigHandlerPath = "/api/v1/scheduler-config" + +type server struct { + *Server +} + +func (s *server) GetCoordinator() *schedule.Coordinator { + return s.GetRaftCluster().GetCoordinator() +} + +func (s *server) GetCluster() sche.SharedCluster { + return s.GetRaftCluster() +} // Handler is a helper to export methods to handle API/RPC requests. type Handler struct { + *handler.Handler s *Server opt *config.PersistOptions pluginChMap map[string]chan string @@ -87,7 +70,16 @@ type Handler struct { } func newHandler(s *Server) *Handler { - return &Handler{s: s, opt: s.persistOptions, pluginChMap: make(map[string]chan string), pluginChMapLock: syncutil.RWMutex{}} + h := handler.NewHandler(&server{ + Server: s, + }) + return &Handler{ + Handler: h, + s: s, + opt: s.persistOptions, + pluginChMap: make(map[string]chan string), + pluginChMapLock: syncutil.RWMutex{}, + } } // GetRaftCluster returns RaftCluster. @@ -99,15 +91,6 @@ func (h *Handler) GetRaftCluster() (*cluster.RaftCluster, error) { return rc, nil } -// GetOperatorController returns OperatorController. -func (h *Handler) GetOperatorController() (*operator.Controller, error) { - rc := h.s.GetRaftCluster() - if rc == nil { - return nil, errs.ErrNotBootstrapped.GenWithStackByArgs() - } - return rc.GetOperatorController(), nil -} - // IsSchedulerPaused returns whether scheduler is paused. func (h *Handler) IsSchedulerPaused(name string) (bool, error) { rc, err := h.GetRaftCluster() @@ -170,7 +153,7 @@ func (h *Handler) GetStores() ([]*core.StoreInfo, error) { storeID := s.GetId() store := rc.GetStore(storeID) if store == nil { - return nil, ErrStoreNotFound(storeID) + return nil, errs.ErrStoreNotFound.FastGenByArgs(storeID) } stores = append(stores, store) } @@ -406,119 +389,6 @@ func (h *Handler) AddGrantHotRegionScheduler(leaderID, peers string) error { return h.AddScheduler(schedulers.GrantHotRegionType, leaderID, peers) } -// GetOperator returns the region operator. -func (h *Handler) GetOperator(regionID uint64) (*operator.Operator, error) { - c, err := h.GetOperatorController() - if err != nil { - return nil, err - } - - op := c.GetOperator(regionID) - if op == nil { - return nil, ErrOperatorNotFound - } - - return op, nil -} - -// GetOperatorStatus returns the status of the region operator. -func (h *Handler) GetOperatorStatus(regionID uint64) (*operator.OpWithStatus, error) { - c, err := h.GetOperatorController() - if err != nil { - return nil, err - } - - op := c.GetOperatorStatus(regionID) - if op == nil { - return nil, ErrOperatorNotFound - } - - return op, nil -} - -// RemoveOperator removes the region operator. 
-func (h *Handler) RemoveOperator(regionID uint64) error { - c, err := h.GetOperatorController() - if err != nil { - return err - } - - op := c.GetOperator(regionID) - if op == nil { - return ErrOperatorNotFound - } - - _ = c.RemoveOperator(op, operator.AdminStop) - return nil -} - -// GetOperators returns the running operators. -func (h *Handler) GetOperators() ([]*operator.Operator, error) { - c, err := h.GetOperatorController() - if err != nil { - return nil, err - } - return c.GetOperators(), nil -} - -// GetWaitingOperators returns the waiting operators. -func (h *Handler) GetWaitingOperators() ([]*operator.Operator, error) { - c, err := h.GetOperatorController() - if err != nil { - return nil, err - } - return c.GetWaitingOperators(), nil -} - -// GetAdminOperators returns the running admin operators. -func (h *Handler) GetAdminOperators() ([]*operator.Operator, error) { - c, err := h.GetOperatorController() - if err != nil { - return nil, err - } - return c.GetOperatorsOfKind(operator.OpAdmin), nil -} - -// GetLeaderOperators returns the running leader operators. -func (h *Handler) GetLeaderOperators() ([]*operator.Operator, error) { - c, err := h.GetOperatorController() - if err != nil { - return nil, err - } - return c.GetOperatorsOfKind(operator.OpLeader), nil -} - -// GetRegionOperators returns the running region operators. -func (h *Handler) GetRegionOperators() ([]*operator.Operator, error) { - c, err := h.GetOperatorController() - if err != nil { - return nil, err - } - return c.GetOperatorsOfKind(operator.OpRegion), nil -} - -// GetHistory returns finished operators' history since start. -func (h *Handler) GetHistory(start time.Time) ([]operator.OpHistory, error) { - c, err := h.GetOperatorController() - if err != nil { - return nil, err - } - return c.GetHistory(start), nil -} - -// GetRecords returns finished operators since start. -func (h *Handler) GetRecords(from time.Time) ([]*operator.OpRecord, error) { - c, err := h.GetOperatorController() - if err != nil { - return nil, err - } - records := c.GetRecords(from) - if len(records) == 0 { - return nil, ErrOperatorNotFound - } - return records, nil -} - // SetAllStoresLimit is used to set limit of all stores. func (h *Handler) SetAllStoresLimit(ratePerMin float64, limitType storelimit.Type) error { c, err := h.GetRaftCluster() @@ -575,349 +445,6 @@ func (h *Handler) SetStoreLimit(storeID uint64, ratePerMin float64, limitType st return c.SetStoreLimit(storeID, limitType, ratePerMin) } -// AddTransferLeaderOperator adds an operator to transfer leader to the store. -func (h *Handler) AddTransferLeaderOperator(regionID uint64, storeID uint64) error { - c, err := h.GetRaftCluster() - if err != nil { - return err - } - - region := c.GetRegion(regionID) - if region == nil { - return ErrRegionNotFound(regionID) - } - - newLeader := region.GetStoreVoter(storeID) - if newLeader == nil { - return errors.Errorf("region has no voter in store %v", storeID) - } - - op, err := operator.CreateTransferLeaderOperator("admin-transfer-leader", c, region, region.GetLeader().GetStoreId(), newLeader.GetStoreId(), []uint64{}, operator.OpAdmin) - if err != nil { - log.Debug("fail to create transfer leader operator", errs.ZapError(err)) - return err - } - if ok := c.GetOperatorController().AddOperator(op); !ok { - return errors.WithStack(ErrAddOperator) - } - return nil -} - -// AddTransferRegionOperator adds an operator to transfer region to the stores. 
-func (h *Handler) AddTransferRegionOperator(regionID uint64, storeIDs map[uint64]placement.PeerRoleType) error { - c, err := h.GetRaftCluster() - if err != nil { - return err - } - - region := c.GetRegion(regionID) - if region == nil { - return ErrRegionNotFound(regionID) - } - - if c.GetOpts().IsPlacementRulesEnabled() { - // Cannot determine role without peer role when placement rules enabled. Not supported now. - for _, role := range storeIDs { - if len(role) == 0 { - return errors.New("transfer region without peer role is not supported when placement rules enabled") - } - } - } - for id := range storeIDs { - if err := checkStoreState(c, id); err != nil { - return err - } - } - - roles := make(map[uint64]placement.PeerRoleType) - for id, peerRole := range storeIDs { - if peerRole == "" { - peerRole = placement.Voter - } - roles[id] = peerRole - } - op, err := operator.CreateMoveRegionOperator("admin-move-region", c, region, operator.OpAdmin, roles) - if err != nil { - log.Debug("fail to create move region operator", errs.ZapError(err)) - return err - } - if ok := c.GetOperatorController().AddOperator(op); !ok { - return errors.WithStack(ErrAddOperator) - } - return nil -} - -// AddTransferPeerOperator adds an operator to transfer peer. -func (h *Handler) AddTransferPeerOperator(regionID uint64, fromStoreID, toStoreID uint64) error { - c, err := h.GetRaftCluster() - if err != nil { - return err - } - - region := c.GetRegion(regionID) - if region == nil { - return ErrRegionNotFound(regionID) - } - - oldPeer := region.GetStorePeer(fromStoreID) - if oldPeer == nil { - return errors.Errorf("region has no peer in store %v", fromStoreID) - } - - if err := checkStoreState(c, toStoreID); err != nil { - return err - } - - newPeer := &metapb.Peer{StoreId: toStoreID, Role: oldPeer.GetRole(), IsWitness: oldPeer.GetIsWitness()} - op, err := operator.CreateMovePeerOperator("admin-move-peer", c, region, operator.OpAdmin, fromStoreID, newPeer) - if err != nil { - log.Debug("fail to create move peer operator", errs.ZapError(err)) - return err - } - if ok := c.GetOperatorController().AddOperator(op); !ok { - return errors.WithStack(ErrAddOperator) - } - return nil -} - -// checkAdminAddPeerOperator checks adminAddPeer operator with given region ID and store ID. -func (h *Handler) checkAdminAddPeerOperator(regionID uint64, toStoreID uint64) (*cluster.RaftCluster, *core.RegionInfo, error) { - c, err := h.GetRaftCluster() - if err != nil { - return nil, nil, err - } - - region := c.GetRegion(regionID) - if region == nil { - return nil, nil, ErrRegionNotFound(regionID) - } - - if region.GetStorePeer(toStoreID) != nil { - return nil, nil, errors.Errorf("region already has peer in store %v", toStoreID) - } - - if err := checkStoreState(c, toStoreID); err != nil { - return nil, nil, err - } - - return c, region, nil -} - -// AddAddPeerOperator adds an operator to add peer. -func (h *Handler) AddAddPeerOperator(regionID uint64, toStoreID uint64) error { - c, region, err := h.checkAdminAddPeerOperator(regionID, toStoreID) - if err != nil { - return err - } - - newPeer := &metapb.Peer{StoreId: toStoreID} - op, err := operator.CreateAddPeerOperator("admin-add-peer", c, region, newPeer, operator.OpAdmin) - if err != nil { - log.Debug("fail to create add peer operator", errs.ZapError(err)) - return err - } - if ok := c.GetOperatorController().AddOperator(op); !ok { - return errors.WithStack(ErrAddOperator) - } - return nil -} - -// AddAddLearnerOperator adds an operator to add learner. 
-func (h *Handler) AddAddLearnerOperator(regionID uint64, toStoreID uint64) error { - c, region, err := h.checkAdminAddPeerOperator(regionID, toStoreID) - if err != nil { - return err - } - - newPeer := &metapb.Peer{ - StoreId: toStoreID, - Role: metapb.PeerRole_Learner, - } - - op, err := operator.CreateAddPeerOperator("admin-add-learner", c, region, newPeer, operator.OpAdmin) - if err != nil { - log.Debug("fail to create add learner operator", errs.ZapError(err)) - return err - } - if ok := c.GetOperatorController().AddOperator(op); !ok { - return errors.WithStack(ErrAddOperator) - } - return nil -} - -// AddRemovePeerOperator adds an operator to remove peer. -func (h *Handler) AddRemovePeerOperator(regionID uint64, fromStoreID uint64) error { - c, err := h.GetRaftCluster() - if err != nil { - return err - } - - region := c.GetRegion(regionID) - if region == nil { - return ErrRegionNotFound(regionID) - } - - if region.GetStorePeer(fromStoreID) == nil { - return errors.Errorf("region has no peer in store %v", fromStoreID) - } - - op, err := operator.CreateRemovePeerOperator("admin-remove-peer", c, operator.OpAdmin, region, fromStoreID) - if err != nil { - log.Debug("fail to create move peer operator", errs.ZapError(err)) - return err - } - if ok := c.GetOperatorController().AddOperator(op); !ok { - return errors.WithStack(ErrAddOperator) - } - return nil -} - -// AddMergeRegionOperator adds an operator to merge region. -func (h *Handler) AddMergeRegionOperator(regionID uint64, targetID uint64) error { - c, err := h.GetRaftCluster() - if err != nil { - return err - } - - region := c.GetRegion(regionID) - if region == nil { - return ErrRegionNotFound(regionID) - } - - target := c.GetRegion(targetID) - if target == nil { - return ErrRegionNotFound(targetID) - } - - if !filter.IsRegionHealthy(region) || !filter.IsRegionReplicated(c, region) { - return ErrRegionAbnormalPeer(regionID) - } - - if !filter.IsRegionHealthy(target) || !filter.IsRegionReplicated(c, target) { - return ErrRegionAbnormalPeer(targetID) - } - - // for the case first region (start key is nil) with the last region (end key is nil) but not adjacent - if (!bytes.Equal(region.GetStartKey(), target.GetEndKey()) || len(region.GetStartKey()) == 0) && - (!bytes.Equal(region.GetEndKey(), target.GetStartKey()) || len(region.GetEndKey()) == 0) { - return ErrRegionNotAdjacent - } - - ops, err := operator.CreateMergeRegionOperator("admin-merge-region", c, region, target, operator.OpAdmin) - if err != nil { - log.Debug("fail to create merge region operator", errs.ZapError(err)) - return err - } - if ok := c.GetOperatorController().AddOperator(ops...); !ok { - return errors.WithStack(ErrAddOperator) - } - return nil -} - -// AddSplitRegionOperator adds an operator to split a region. 
-func (h *Handler) AddSplitRegionOperator(regionID uint64, policyStr string, keys []string) error { - c, err := h.GetRaftCluster() - if err != nil { - return err - } - - region := c.GetRegion(regionID) - if region == nil { - return ErrRegionNotFound(regionID) - } - - policy, ok := pdpb.CheckPolicy_value[strings.ToUpper(policyStr)] - if !ok { - return errors.Errorf("check policy %s is not supported", policyStr) - } - - var splitKeys [][]byte - if pdpb.CheckPolicy(policy) == pdpb.CheckPolicy_USEKEY { - for i := range keys { - k, err := hex.DecodeString(keys[i]) - if err != nil { - return errors.Errorf("split key %s is not in hex format", keys[i]) - } - splitKeys = append(splitKeys, k) - } - } - - op, err := operator.CreateSplitRegionOperator("admin-split-region", region, operator.OpAdmin, pdpb.CheckPolicy(policy), splitKeys) - if err != nil { - return err - } - - if ok := c.GetOperatorController().AddOperator(op); !ok { - return errors.WithStack(ErrAddOperator) - } - return nil -} - -// AddScatterRegionOperator adds an operator to scatter a region. -func (h *Handler) AddScatterRegionOperator(regionID uint64, group string) error { - c, err := h.GetRaftCluster() - if err != nil { - return err - } - - region := c.GetRegion(regionID) - if region == nil { - return ErrRegionNotFound(regionID) - } - - if c.IsRegionHot(region) { - return errors.Errorf("region %d is a hot region", regionID) - } - - op, err := c.GetRegionScatter().Scatter(region, group, false) - if err != nil { - return err - } - - if op == nil { - return nil - } - if ok := c.GetOperatorController().AddOperator(op); !ok { - return errors.WithStack(ErrAddOperator) - } - return nil -} - -// AddScatterRegionsOperators add operators to scatter regions and return the processed percentage and error -func (h *Handler) AddScatterRegionsOperators(regionIDs []uint64, startRawKey, endRawKey, group string, retryLimit int) (int, error) { - c, err := h.GetRaftCluster() - if err != nil { - return 0, err - } - opsCount := 0 - var failures map[uint64]error - // If startKey and endKey are both defined, use them first. - if len(startRawKey) > 0 && len(endRawKey) > 0 { - startKey, err := hex.DecodeString(startRawKey) - if err != nil { - return 0, err - } - endKey, err := hex.DecodeString(endRawKey) - if err != nil { - return 0, err - } - opsCount, failures, err = c.GetRegionScatter().ScatterRegionsByRange(startKey, endKey, group, retryLimit) - if err != nil { - return 0, err - } - } else { - opsCount, failures, err = c.GetRegionScatter().ScatterRegionsByID(regionIDs, group, retryLimit, false) - if err != nil { - return 0, err - } - } - percentage := 100 - if len(failures) > 0 { - percentage = 100 - 100*len(failures)/(opsCount+len(failures)) - } - return percentage, nil -} - // GetRegionsByType gets the region with specified type. func (h *Handler) GetRegionsByType(typ statistics.RegionStatisticType) ([]*core.RegionInfo, error) { c := h.s.GetRaftCluster() @@ -1009,7 +536,7 @@ func (h *Handler) PluginUnload(pluginPath string) error { ch <- schedule.PluginUnload return nil } - return ErrPluginNotFound(pluginPath) + return errs.ErrPluginNotFound.FastGenByArgs(pluginPath) } // GetAddr returns the server urls for clients. 
@@ -1100,20 +627,6 @@ func (h *Handler) GetHistoryHotRegionIter( return iter } -func checkStoreState(rc *cluster.RaftCluster, storeID uint64) error { - store := rc.GetStore(storeID) - if store == nil { - return errs.ErrStoreNotFound.FastGenByArgs(storeID) - } - if store.IsRemoved() { - return errs.ErrStoreRemoved.FastGenByArgs(storeID) - } - if store.IsUnhealthy() { - return errs.ErrStoreUnhealthy.FastGenByArgs(storeID) - } - return nil -} - // RedirectSchedulerUpdate update scheduler config. Export this func to help handle damaged store. func (h *Handler) redirectSchedulerUpdate(name string, storeID float64) error { input := make(map[string]interface{}) From ec4f4a976a7292946ee2c04addcf0150cf4ca177 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Wed, 20 Sep 2023 11:49:42 +0800 Subject: [PATCH 04/14] schedulers, handler: fix the evict-leader-scheduler sync (#7105) ref tikv/pd#5839 - Pass the correct `removeSchedulerCb` when the server is in API mode. - Pause and resume the store leader transfer when reloading the cfg. Signed-off-by: JmPotato Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/mcs/scheduling/server/cluster.go | 1 + pkg/schedule/schedulers/evict_leader.go | 13 +++ server/handler.go | 8 +- .../mcs/scheduling/server_test.go | 92 +++++++++++++------ tests/server/api/testutil.go | 25 ++++- 5 files changed, 109 insertions(+), 30 deletions(-) diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index 917831ba9ca..eb6876db0a1 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -204,6 +204,7 @@ func (c *Cluster) updateScheduler() { log.Info("cluster is closing, stop listening the schedulers updating notifier") return case <-notifier: + // This is triggered by the watcher when the schedulers are updated. } log.Info("schedulers updating notifier is triggered, try to update the scheduler") diff --git a/pkg/schedule/schedulers/evict_leader.go b/pkg/schedule/schedulers/evict_leader.go index 1989c42ba6f..2551b9ac9cb 100644 --- a/pkg/schedule/schedulers/evict_leader.go +++ b/pkg/schedule/schedulers/evict_leader.go @@ -218,6 +218,19 @@ func (s *evictLeaderScheduler) ReloadConfig() error { if err = DecodeConfig([]byte(cfgData), newCfg); err != nil { return err } + // Resume and pause the leader transfer for each store. 
+ for id := range s.conf.StoreIDWithRanges { + if _, ok := newCfg.StoreIDWithRanges[id]; ok { + continue + } + s.conf.cluster.ResumeLeaderTransfer(id) + } + for id := range newCfg.StoreIDWithRanges { + if _, ok := s.conf.StoreIDWithRanges[id]; ok { + continue + } + s.conf.cluster.PauseLeaderTransfer(id) + } s.conf.StoreIDWithRanges = newCfg.StoreIDWithRanges return nil } diff --git a/server/handler.go b/server/handler.go index 585f362cad8..ecc337b7193 100644 --- a/server/handler.go +++ b/server/handler.go @@ -214,7 +214,13 @@ func (h *Handler) AddScheduler(name string, args ...string) error { return err } - s, err := schedulers.CreateScheduler(name, c.GetOperatorController(), h.s.storage, schedulers.ConfigSliceDecoder(name, args), c.GetCoordinator().GetSchedulersController().RemoveScheduler) + var removeSchedulerCb func(string) error + if h.s.IsAPIServiceMode() { + removeSchedulerCb = c.GetCoordinator().GetSchedulersController().RemoveSchedulerHandler + } else { + removeSchedulerCb = c.GetCoordinator().GetSchedulersController().RemoveScheduler + } + s, err := schedulers.CreateScheduler(name, c.GetOperatorController(), h.s.storage, schedulers.ConfigSliceDecoder(name, args), removeSchedulerCb) if err != nil { return err } diff --git a/tests/integrations/mcs/scheduling/server_test.go b/tests/integrations/mcs/scheduling/server_test.go index 187ba54dfcb..e469c593b84 100644 --- a/tests/integrations/mcs/scheduling/server_test.go +++ b/tests/integrations/mcs/scheduling/server_test.go @@ -17,12 +17,14 @@ package scheduling import ( "context" "fmt" + "net/http" "testing" "time" "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" + "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" mcs "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/schedule/schedulers" @@ -196,25 +198,15 @@ func (suite *serverTestSuite) TestSchedulerSync() { defer tc.Destroy() tc.WaitForPrimaryServing(re) schedulersController := tc.GetPrimaryServer().GetCluster().GetCoordinator().GetSchedulersController() - re.Len(schedulersController.GetSchedulerNames(), 5) - re.Nil(schedulersController.GetScheduler(schedulers.EvictLeaderName)) + checkEvictLeaderSchedulerExist(re, schedulersController, false) // Add a new evict-leader-scheduler through the API server. api.MustAddScheduler(re, suite.backendEndpoints, schedulers.EvictLeaderName, map[string]interface{}{ "store_id": 1, }) // Check if the evict-leader-scheduler is added. - testutil.Eventually(re, func() bool { - return len(schedulersController.GetSchedulerNames()) == 6 && - schedulersController.GetScheduler(schedulers.EvictLeaderName) != nil - }) - handler, ok := schedulersController.GetSchedulerHandlers()[schedulers.EvictLeaderName] - re.True(ok) - h, ok := handler.(interface { - EvictStoreIDs() []uint64 - }) - re.True(ok) - re.ElementsMatch(h.EvictStoreIDs(), []uint64{1}) - // Update the evict-leader-scheduler through the API server. + checkEvictLeaderSchedulerExist(re, schedulersController, true) + checkEvictLeaderStoreIDs(re, schedulersController, []uint64{1}) + // Add a store_id to the evict-leader-scheduler through the API server. 
err = suite.pdLeader.GetServer().GetRaftCluster().PutStore( &metapb.Store{ Id: 2, @@ -229,25 +221,69 @@ func (suite *serverTestSuite) TestSchedulerSync() { api.MustAddScheduler(re, suite.backendEndpoints, schedulers.EvictLeaderName, map[string]interface{}{ "store_id": 2, }) - var evictStoreIDs []uint64 - testutil.Eventually(re, func() bool { - evictStoreIDs = h.EvictStoreIDs() - return len(evictStoreIDs) == 2 + checkEvictLeaderSchedulerExist(re, schedulersController, true) + checkEvictLeaderStoreIDs(re, schedulersController, []uint64{1, 2}) + // Delete a store_id from the evict-leader-scheduler through the API server. + api.MustDeleteScheduler(re, suite.backendEndpoints, fmt.Sprintf("%s-%d", schedulers.EvictLeaderName, 1)) + checkEvictLeaderSchedulerExist(re, schedulersController, true) + checkEvictLeaderStoreIDs(re, schedulersController, []uint64{2}) + // Add a store_id to the evict-leader-scheduler through the API server by the scheduler handler. + api.MustCallSchedulerConfigAPI(re, http.MethodPost, suite.backendEndpoints, schedulers.EvictLeaderName, []string{"config"}, map[string]interface{}{ + "name": schedulers.EvictLeaderName, + "store_id": 1, }) - re.ElementsMatch(evictStoreIDs, []uint64{1, 2}) + checkEvictLeaderSchedulerExist(re, schedulersController, true) + checkEvictLeaderStoreIDs(re, schedulersController, []uint64{1, 2}) + // Delete a store_id from the evict-leader-scheduler through the API server by the scheduler handler. + api.MustCallSchedulerConfigAPI(re, http.MethodDelete, suite.backendEndpoints, schedulers.EvictLeaderName, []string{"delete", "2"}, nil) + checkEvictLeaderSchedulerExist(re, schedulersController, true) + checkEvictLeaderStoreIDs(re, schedulersController, []uint64{1}) + // If the last store is deleted, the scheduler should be removed. + api.MustCallSchedulerConfigAPI(re, http.MethodDelete, suite.backendEndpoints, schedulers.EvictLeaderName, []string{"delete", "1"}, nil) + // Check if the scheduler is removed. + checkEvictLeaderSchedulerExist(re, schedulersController, false) + + // Delete the evict-leader-scheduler through the API server by removing the last store_id. + api.MustAddScheduler(re, suite.backendEndpoints, schedulers.EvictLeaderName, map[string]interface{}{ + "store_id": 1, + }) + checkEvictLeaderSchedulerExist(re, schedulersController, true) + checkEvictLeaderStoreIDs(re, schedulersController, []uint64{1}) api.MustDeleteScheduler(re, suite.backendEndpoints, fmt.Sprintf("%s-%d", schedulers.EvictLeaderName, 1)) - testutil.Eventually(re, func() bool { - evictStoreIDs = h.EvictStoreIDs() - return len(evictStoreIDs) == 1 + checkEvictLeaderSchedulerExist(re, schedulersController, false) + + // Delete the evict-leader-scheduler through the API server. + api.MustAddScheduler(re, suite.backendEndpoints, schedulers.EvictLeaderName, map[string]interface{}{ + "store_id": 1, }) - re.ElementsMatch(evictStoreIDs, []uint64{2}) - // Remove the evict-leader-scheduler through the API server. + checkEvictLeaderSchedulerExist(re, schedulersController, true) + checkEvictLeaderStoreIDs(re, schedulersController, []uint64{1}) api.MustDeleteScheduler(re, suite.backendEndpoints, schedulers.EvictLeaderName) - // Check if the scheduler is removed. + checkEvictLeaderSchedulerExist(re, schedulersController, false) + + // TODO: test more schedulers. 
+} + +func checkEvictLeaderSchedulerExist(re *require.Assertions, sc *schedulers.Controller, exist bool) { testutil.Eventually(re, func() bool { - return len(schedulersController.GetSchedulerNames()) == 5 && - schedulersController.GetScheduler(schedulers.EvictLeaderName) == nil + if !exist { + return sc.GetScheduler(schedulers.EvictLeaderName) == nil + } + return sc.GetScheduler(schedulers.EvictLeaderName) != nil }) +} - // TODO: test more schedulers. +func checkEvictLeaderStoreIDs(re *require.Assertions, sc *schedulers.Controller, expected []uint64) { + handler, ok := sc.GetSchedulerHandlers()[schedulers.EvictLeaderName] + re.True(ok) + h, ok := handler.(interface { + EvictStoreIDs() []uint64 + }) + re.True(ok) + var evictStoreIDs []uint64 + testutil.Eventually(re, func() bool { + evictStoreIDs = h.EvictStoreIDs() + return len(evictStoreIDs) == len(expected) + }) + re.ElementsMatch(evictStoreIDs, expected) } diff --git a/tests/server/api/testutil.go b/tests/server/api/testutil.go index 7a33bf39048..c6c2cc79611 100644 --- a/tests/server/api/testutil.go +++ b/tests/server/api/testutil.go @@ -20,11 +20,15 @@ import ( "fmt" "io" "net/http" + "path" "github.com/stretchr/testify/require" ) -const schedulersPrefix = "/pd/api/v1/schedulers" +const ( + schedulersPrefix = "/pd/api/v1/schedulers" + schedulerConfigPrefix = "/pd/api/v1/scheduler-config" +) // dialClient used to dial http request. var dialClient = &http.Client{ @@ -68,3 +72,22 @@ func MustDeleteScheduler(re *require.Assertions, serverAddr, schedulerName strin re.NoError(err) re.Equal(http.StatusOK, resp.StatusCode, string(data)) } + +// MustCallSchedulerConfigAPI calls a scheduler config with HTTP API with the given args. +func MustCallSchedulerConfigAPI( + re *require.Assertions, + method, serverAddr, schedulerName string, args []string, + input map[string]interface{}, +) { + data, err := json.Marshal(input) + re.NoError(err) + args = append([]string{schedulerConfigPrefix, schedulerName}, args...) 
+ httpReq, err := http.NewRequest(method, fmt.Sprintf("%s%s", serverAddr, path.Join(args...)), bytes.NewBuffer(data)) + re.NoError(err) + resp, err := dialClient.Do(httpReq) + re.NoError(err) + defer resp.Body.Close() + data, err = io.ReadAll(resp.Body) + re.NoError(err) + re.Equal(http.StatusOK, resp.StatusCode, string(data)) +} From b61a3181101cd6bb935ea3ecc6f2d800b27013db Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Wed, 20 Sep 2023 13:58:43 +0800 Subject: [PATCH 05/14] mcs: forward region requests to scheduling server (#7023) ref tikv/pd#5839 Signed-off-by: Ryan Leung --- go.mod | 2 +- go.sum | 4 +- pkg/cluster/cluster.go | 65 +++++++++++ pkg/core/region.go | 62 ++++++++--- pkg/mcs/scheduling/server/cluster.go | 81 +++++++++++++- pkg/mcs/scheduling/server/config/config.go | 1 - pkg/mcs/scheduling/server/grpc_service.go | 104 ++++++++++++++++++ pkg/mcs/scheduling/server/server.go | 2 +- pkg/mock/mockhbstream/mockhbstream.go | 8 +- pkg/mock/mockhbstream/mockhbstream_test.go | 20 ++-- pkg/schedule/hbstream/heartbeat_streams.go | 82 +++++++++++--- pkg/schedule/operator/step.go | 51 ++++----- server/api/stats.go | 2 +- server/cluster/cluster.go | 46 ++++---- server/cluster/cluster_test.go | 8 +- server/grpc_service.go | 97 +++++++++++++++- server/server.go | 2 +- tests/integrations/client/go.mod | 2 +- tests/integrations/client/go.sum | 4 +- tests/integrations/mcs/go.mod | 4 +- tests/integrations/mcs/go.sum | 4 +- .../mcs/scheduling/server_test.go | 79 +++++++++++++ tests/integrations/tso/go.mod | 2 +- tests/integrations/tso/go.sum | 4 +- tests/server/api/api_test.go | 5 + tools/pd-api-bench/go.mod | 2 +- tools/pd-api-bench/go.sum | 4 +- 27 files changed, 614 insertions(+), 133 deletions(-) create mode 100644 pkg/cluster/cluster.go diff --git a/go.mod b/go.mod index bb72c0d5dba..fdde4fa7d19 100644 --- a/go.mod +++ b/go.mod @@ -33,7 +33,7 @@ require ( github.com/pingcap/errcode v0.3.0 github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 - github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96 + github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 github.com/pingcap/tidb-dashboard v0.0.0-20230911054332-22add1e00511 diff --git a/go.sum b/go.sum index a007acf869e..5cff08df752 100644 --- a/go.sum +++ b/go.sum @@ -439,8 +439,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96 h1:Upb52Po0Ev1lPKQdUT4suRwQ5Z49A7gEmJ0trADKftM= -github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b h1:XwwIxepR+uuSYWhdQtstEdr67XUE7X6lpSIHVh5iWjs= +github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 
h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go new file mode 100644 index 00000000000..0b3e0351b16 --- /dev/null +++ b/pkg/cluster/cluster.go @@ -0,0 +1,65 @@ +// Copyright 2023 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cluster + +import ( + "github.com/tikv/pd/pkg/core" + "github.com/tikv/pd/pkg/schedule" + "github.com/tikv/pd/pkg/schedule/placement" + "github.com/tikv/pd/pkg/statistics" +) + +// Cluster provides an overview of a cluster's basic information. +type Cluster interface { + GetHotStat() *statistics.HotStat + GetRegionStats() *statistics.RegionStatistics + GetLabelStats() *statistics.LabelStatistics + GetCoordinator() *schedule.Coordinator + GetRuleManager() *placement.RuleManager +} + +// HandleStatsAsync handles the flow asynchronously. +func HandleStatsAsync(c Cluster, region *core.RegionInfo) { + c.GetHotStat().CheckWriteAsync(statistics.NewCheckExpiredItemTask(region)) + c.GetHotStat().CheckReadAsync(statistics.NewCheckExpiredItemTask(region)) + reportInterval := region.GetInterval() + interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() + for _, peer := range region.GetPeers() { + peerInfo := core.NewPeerInfo(peer, region.GetWriteLoads(), interval) + c.GetHotStat().CheckWriteAsync(statistics.NewCheckPeerTask(peerInfo, region)) + } + c.GetCoordinator().GetSchedulersController().CheckTransferWitnessLeader(region) +} + +// HandleOverlaps handles the overlap regions. +func HandleOverlaps(c Cluster, overlaps []*core.RegionInfo) { + for _, item := range overlaps { + if c.GetRegionStats() != nil { + c.GetRegionStats().ClearDefunctRegion(item.GetID()) + } + c.GetLabelStats().ClearDefunctRegion(item.GetID()) + c.GetRuleManager().InvalidCache(item.GetID()) + } +} + +// Collect collects the cluster information. +func Collect(c Cluster, region *core.RegionInfo, stores []*core.StoreInfo, hasRegionStats, isNew, isPrepared bool) { + if hasRegionStats { + c.GetRegionStats().Observe(region, stores) + } + if !isPrepared && isNew { + c.GetCoordinator().GetPrepareChecker().Collect(region) + } +} diff --git a/pkg/core/region.go b/pkg/core/region.go index 57ba5cb3db0..4540f7aafb3 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -143,8 +143,31 @@ const ( InitClusterRegionThreshold = 100 ) +// RegionHeartbeatResponse is the interface for region heartbeat response. +type RegionHeartbeatResponse interface { + GetTargetPeer() *metapb.Peer + GetRegionId() uint64 +} + +// RegionHeartbeatRequest is the interface for region heartbeat request. 
+type RegionHeartbeatRequest interface { + GetTerm() uint64 + GetRegion() *metapb.Region + GetLeader() *metapb.Peer + GetDownPeers() []*pdpb.PeerStats + GetPendingPeers() []*metapb.Peer + GetBytesWritten() uint64 + GetKeysWritten() uint64 + GetBytesRead() uint64 + GetKeysRead() uint64 + GetInterval() *pdpb.TimeInterval + GetQueryStats() *pdpb.QueryStats + GetApproximateSize() uint64 + GetApproximateKeys() uint64 +} + // RegionFromHeartbeat constructs a Region from region heartbeat. -func RegionFromHeartbeat(heartbeat *pdpb.RegionHeartbeatRequest, opts ...RegionCreateOption) *RegionInfo { +func RegionFromHeartbeat(heartbeat RegionHeartbeatRequest, opts ...RegionCreateOption) *RegionInfo { // Convert unit to MB. // If region isn't empty and less than 1MB, use 1MB instead. regionSize := heartbeat.GetApproximateSize() / units.MiB @@ -153,25 +176,28 @@ func RegionFromHeartbeat(heartbeat *pdpb.RegionHeartbeatRequest, opts ...RegionC if heartbeat.GetApproximateSize() > 0 && regionSize < EmptyRegionApproximateSize { regionSize = EmptyRegionApproximateSize } - regionKvSize := heartbeat.GetApproximateKvSize() / units.MiB region := &RegionInfo{ - term: heartbeat.GetTerm(), - meta: heartbeat.GetRegion(), - leader: heartbeat.GetLeader(), - downPeers: heartbeat.GetDownPeers(), - pendingPeers: heartbeat.GetPendingPeers(), - cpuUsage: heartbeat.GetCpuUsage(), - writtenBytes: heartbeat.GetBytesWritten(), - writtenKeys: heartbeat.GetKeysWritten(), - readBytes: heartbeat.GetBytesRead(), - readKeys: heartbeat.GetKeysRead(), - approximateSize: int64(regionSize), - approximateKvSize: int64(regionKvSize), - approximateKeys: int64(heartbeat.GetApproximateKeys()), - interval: heartbeat.GetInterval(), - replicationStatus: heartbeat.GetReplicationStatus(), - queryStats: heartbeat.GetQueryStats(), + term: heartbeat.GetTerm(), + meta: heartbeat.GetRegion(), + leader: heartbeat.GetLeader(), + downPeers: heartbeat.GetDownPeers(), + pendingPeers: heartbeat.GetPendingPeers(), + writtenBytes: heartbeat.GetBytesWritten(), + writtenKeys: heartbeat.GetKeysWritten(), + readBytes: heartbeat.GetBytesRead(), + readKeys: heartbeat.GetKeysRead(), + approximateSize: int64(regionSize), + approximateKeys: int64(heartbeat.GetApproximateKeys()), + interval: heartbeat.GetInterval(), + queryStats: heartbeat.GetQueryStats(), + } + + // scheduling service doesn't need the following fields. 
+ if h, ok := heartbeat.(*pdpb.RegionHeartbeatRequest); ok { + region.approximateKvSize = int64(h.GetApproximateKvSize() / units.MiB) + region.replicationStatus = h.GetReplicationStatus() + region.cpuUsage = h.GetCpuUsage() } for _, opt := range opts { diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index eb6876db0a1..0b9924f230b 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -10,6 +10,7 @@ import ( "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/kvproto/pkg/schedulingpb" "github.com/pingcap/log" + "github.com/tikv/pd/pkg/cluster" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/mcs/scheduling/server/config" @@ -17,6 +18,7 @@ import ( sc "github.com/tikv/pd/pkg/schedule/config" "github.com/tikv/pd/pkg/schedule/hbstream" "github.com/tikv/pd/pkg/schedule/labeler" + "github.com/tikv/pd/pkg/schedule/operator" "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/slice" @@ -38,7 +40,7 @@ type Cluster struct { ruleManager *placement.RuleManager labelerManager *labeler.RegionLabeler regionStats *statistics.RegionStatistics - labelLevelStats *statistics.LabelStatistics + labelStats *statistics.LabelStatistics hotStat *statistics.HotStat storage storage.Storage coordinator *schedule.Coordinator @@ -66,8 +68,8 @@ func NewCluster(parentCtx context.Context, persistConfig *config.PersistConfig, labelerManager: labelerManager, persistConfig: persistConfig, hotStat: statistics.NewHotStat(ctx), + labelStats: statistics.NewLabelStatistics(), regionStats: statistics.NewRegionStatistics(basicCluster, persistConfig, ruleManager), - labelLevelStats: statistics.NewLabelStatistics(), storage: storage, clusterID: clusterID, checkMembershipCh: checkMembershipCh, @@ -86,6 +88,21 @@ func (c *Cluster) GetCoordinator() *schedule.Coordinator { return c.coordinator } +// GetHotStat gets hot stat. +func (c *Cluster) GetHotStat() *statistics.HotStat { + return c.hotStat +} + +// GetRegionStats gets region statistics. +func (c *Cluster) GetRegionStats() *statistics.RegionStatistics { + return c.regionStats +} + +// GetLabelStats gets label statistics. +func (c *Cluster) GetLabelStats() *statistics.LabelStatistics { + return c.labelStats +} + // GetBasicCluster returns the basic cluster. func (c *Cluster) GetBasicCluster() *core.BasicCluster { return c.BasicCluster @@ -287,7 +304,7 @@ func (c *Cluster) waitSchedulersInitialized() { // UpdateRegionsLabelLevelStats updates the status of the region label level by types. func (c *Cluster) UpdateRegionsLabelLevelStats(regions []*core.RegionInfo) { for _, region := range regions { - c.labelLevelStats.Observe(region, c.getStoresWithoutLabelLocked(region, core.EngineKey, core.EngineTiFlash), c.persistConfig.GetLocationLabels()) + c.labelStats.Observe(region, c.getStoresWithoutLabelLocked(region, core.EngineKey, core.EngineTiFlash), c.persistConfig.GetLocationLabels()) } } @@ -389,3 +406,61 @@ func (c *Cluster) StopBackgroundJobs() { c.cancel() c.wg.Wait() } + +// HandleRegionHeartbeat processes RegionInfo reports from client. +func (c *Cluster) HandleRegionHeartbeat(region *core.RegionInfo) error { + if err := c.processRegionHeartbeat(region); err != nil { + return err + } + + c.coordinator.GetOperatorController().Dispatch(region, operator.DispatchFromHeartBeat, c.coordinator.RecordOpStepWithTTL) + return nil +} + +// processRegionHeartbeat updates the region information. 
+func (c *Cluster) processRegionHeartbeat(region *core.RegionInfo) error { + origin, _, err := c.PreCheckPutRegion(region) + if err != nil { + return err + } + if c.GetStoreConfig().IsEnableRegionBucket() { + region.InheritBuckets(origin) + } + + cluster.HandleStatsAsync(c, region) + + hasRegionStats := c.regionStats != nil + // Save to storage if meta is updated, except for flashback. + // Save to cache if meta or leader is updated, or contains any down/pending peer. + // Mark isNew if the region in cache does not have leader. + isNew, _, saveCache, _ := core.GenerateRegionGuideFunc(true)(region, origin) + if !saveCache && !isNew { + // Due to some config changes need to update the region stats as well, + // so we do some extra checks here. + if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { + c.regionStats.Observe(region, c.GetRegionStores(region)) + } + return nil + } + + var overlaps []*core.RegionInfo + if saveCache { + // To prevent a concurrent heartbeat of another region from overriding the up-to-date region info by a stale one, + // check its validation again here. + // + // However it can't solve the race condition of concurrent heartbeats from the same region. + if overlaps, err = c.AtomicCheckAndPutRegion(region); err != nil { + return err + } + + cluster.HandleOverlaps(c, overlaps) + } + + cluster.Collect(c, region, c.GetRegionStores(region), hasRegionStats, isNew, c.IsPrepared()) + return nil +} + +// IsPrepared return true if the prepare checker is ready. +func (c *Cluster) IsPrepared() bool { + return c.coordinator.GetPrepareChecker().IsPrepared() +} diff --git a/pkg/mcs/scheduling/server/config/config.go b/pkg/mcs/scheduling/server/config/config.go index 82c15632b3d..e1d680069ce 100644 --- a/pkg/mcs/scheduling/server/config/config.go +++ b/pkg/mcs/scheduling/server/config/config.go @@ -110,7 +110,6 @@ func (c *Config) Parse(flagSet *pflag.FlagSet) error { configutil.AdjustCommandLineString(flagSet, &c.BackendEndpoints, "backend-endpoints") configutil.AdjustCommandLineString(flagSet, &c.ListenAddr, "listen-addr") configutil.AdjustCommandLineString(flagSet, &c.AdvertiseListenAddr, "advertise-listen-addr") - return c.adjust(meta) } diff --git a/pkg/mcs/scheduling/server/grpc_service.go b/pkg/mcs/scheduling/server/grpc_service.go index f615e0c37c0..4558688822a 100644 --- a/pkg/mcs/scheduling/server/grpc_service.go +++ b/pkg/mcs/scheduling/server/grpc_service.go @@ -16,13 +16,19 @@ package server import ( "context" + "io" "net/http" + "sync/atomic" + "time" + "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/schedulingpb" "github.com/pingcap/log" bs "github.com/tikv/pd/pkg/basicserver" + "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/mcs/registry" "github.com/tikv/pd/pkg/utils/apiutil" + "github.com/tikv/pd/pkg/utils/logutil" "go.uber.org/zap" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -67,6 +73,104 @@ func NewService[T ConfigProvider](svr bs.Server) registry.RegistrableService { } } +// heartbeatServer wraps Scheduling_RegionHeartbeatServer to ensure when any error +// occurs on Send() or Recv(), both endpoints will be closed. 
+type heartbeatServer struct { + stream schedulingpb.Scheduling_RegionHeartbeatServer + closed int32 +} + +func (s *heartbeatServer) Send(m core.RegionHeartbeatResponse) error { + if atomic.LoadInt32(&s.closed) == 1 { + return io.EOF + } + done := make(chan error, 1) + go func() { + defer logutil.LogPanic() + done <- s.stream.Send(m.(*schedulingpb.RegionHeartbeatResponse)) + }() + timer := time.NewTimer(5 * time.Second) + defer timer.Stop() + select { + case err := <-done: + if err != nil { + atomic.StoreInt32(&s.closed, 1) + } + return errors.WithStack(err) + case <-timer.C: + atomic.StoreInt32(&s.closed, 1) + return status.Errorf(codes.DeadlineExceeded, "send heartbeat timeout") + } +} + +func (s *heartbeatServer) Recv() (*schedulingpb.RegionHeartbeatRequest, error) { + if atomic.LoadInt32(&s.closed) == 1 { + return nil, io.EOF + } + req, err := s.stream.Recv() + if err != nil { + atomic.StoreInt32(&s.closed, 1) + return nil, errors.WithStack(err) + } + return req, nil +} + +// RegionHeartbeat implements gRPC PDServer. +func (s *Service) RegionHeartbeat(stream schedulingpb.Scheduling_RegionHeartbeatServer) error { + var ( + server = &heartbeatServer{stream: stream} + cancel context.CancelFunc + lastBind time.Time + ) + defer func() { + // cancel the forward stream + if cancel != nil { + cancel() + } + }() + + for { + request, err := server.Recv() + if err == io.EOF { + return nil + } + if err != nil { + return errors.WithStack(err) + } + + c := s.GetCluster() + if c == nil { + resp := &schedulingpb.RegionHeartbeatResponse{Header: &schedulingpb.ResponseHeader{ + ClusterId: s.clusterID, + Error: &schedulingpb.Error{ + Type: schedulingpb.ErrorType_NOT_BOOTSTRAPPED, + Message: "scheduling server is not initialized yet", + }, + }} + err := server.Send(resp) + return errors.WithStack(err) + } + + storeID := request.GetLeader().GetStoreId() + store := c.GetStore(storeID) + if store == nil { + return errors.Errorf("invalid store ID %d, not found", storeID) + } + + if time.Since(lastBind) > time.Minute { + s.hbStreams.BindStream(storeID, server) + lastBind = time.Now() + } + region := core.RegionFromHeartbeat(request, core.SetFromHeartbeat(true)) + err = c.HandleRegionHeartbeat(region) + if err != nil { + // TODO: if we need to send the error back to API server. + log.Error("failed handle region heartbeat", zap.Error(err)) + continue + } + } +} + // StoreHeartbeat implements gRPC PDServer. 
func (s *Service) StoreHeartbeat(ctx context.Context, request *schedulingpb.StoreHeartbeatRequest) (*schedulingpb.StoreHeartbeatResponse, error) { c := s.GetCluster() diff --git a/pkg/mcs/scheduling/server/server.go b/pkg/mcs/scheduling/server/server.go index f4c5c676dd3..fd7621bf2cb 100644 --- a/pkg/mcs/scheduling/server/server.go +++ b/pkg/mcs/scheduling/server/server.go @@ -437,7 +437,7 @@ func (s *Server) startCluster(context.Context) error { if err != nil { return err } - s.hbStreams = hbstream.NewHeartbeatStreams(s.Context(), s.clusterID, s.basicCluster) + s.hbStreams = hbstream.NewHeartbeatStreams(s.Context(), s.clusterID, utils.SchedulingServiceName, s.basicCluster) s.cluster, err = NewCluster(s.Context(), s.persistConfig, s.storage, s.basicCluster, s.hbStreams, s.clusterID, s.checkMembershipCh) if err != nil { return err diff --git a/pkg/mock/mockhbstream/mockhbstream.go b/pkg/mock/mockhbstream/mockhbstream.go index c94042bf102..289f31d63dd 100644 --- a/pkg/mock/mockhbstream/mockhbstream.go +++ b/pkg/mock/mockhbstream/mockhbstream.go @@ -25,18 +25,18 @@ import ( // HeartbeatStream is used to mock HeartbeatStream for test use. type HeartbeatStream struct { - ch chan *pdpb.RegionHeartbeatResponse + ch chan core.RegionHeartbeatResponse } // NewHeartbeatStream creates a new HeartbeatStream. func NewHeartbeatStream() HeartbeatStream { return HeartbeatStream{ - ch: make(chan *pdpb.RegionHeartbeatResponse), + ch: make(chan core.RegionHeartbeatResponse), } } // Send mocks method. -func (s HeartbeatStream) Send(m *pdpb.RegionHeartbeatResponse) error { +func (s HeartbeatStream) Send(m core.RegionHeartbeatResponse) error { select { case <-time.After(time.Second): return errors.New("timeout") @@ -52,7 +52,7 @@ func (s HeartbeatStream) SendMsg(region *core.RegionInfo, msg *pdpb.RegionHeartb func (s HeartbeatStream) BindStream(storeID uint64, stream hbstream.HeartbeatStream) {} // Recv mocks method. -func (s HeartbeatStream) Recv() *pdpb.RegionHeartbeatResponse { +func (s HeartbeatStream) Recv() core.RegionHeartbeatResponse { select { case <-time.After(time.Millisecond * 10): return nil diff --git a/pkg/mock/mockhbstream/mockhbstream_test.go b/pkg/mock/mockhbstream/mockhbstream_test.go index 46af7df534b..a8e88f61aee 100644 --- a/pkg/mock/mockhbstream/mockhbstream_test.go +++ b/pkg/mock/mockhbstream/mockhbstream_test.go @@ -22,12 +22,10 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/stretchr/testify/require" - "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/mock/mockcluster" "github.com/tikv/pd/pkg/mock/mockconfig" "github.com/tikv/pd/pkg/schedule/hbstream" "github.com/tikv/pd/pkg/utils/testutil" - "github.com/tikv/pd/pkg/utils/typeutil" ) func TestActivity(t *testing.T) { @@ -41,37 +39,33 @@ func TestActivity(t *testing.T) { cluster.AddRegionStore(2, 0) cluster.AddLeaderRegion(1, 1) region := cluster.GetRegion(1) - msg := &pdpb.RegionHeartbeatResponse{ - ChangePeer: &pdpb.ChangePeer{Peer: &metapb.Peer{Id: 2, StoreId: 2}, ChangeType: eraftpb.ConfChangeType_AddLearnerNode}, - } - hbs := hbstream.NewTestHeartbeatStreams(ctx, cluster.ID, cluster, true) stream1, stream2 := NewHeartbeatStream(), NewHeartbeatStream() // Active stream is stream1. 
hbs.BindStream(1, stream1) testutil.Eventually(re, func() bool { - newMsg := typeutil.DeepClone(msg, core.RegionHeartbeatResponseFactory) - hbs.SendMsg(region, newMsg) + msg := &hbstream.Operation{ChangePeer: &pdpb.ChangePeer{Peer: &metapb.Peer{Id: 2, StoreId: 2}, ChangeType: eraftpb.ConfChangeType_AddLearnerNode}} + hbs.SendMsg(region, msg) return stream1.Recv() != nil && stream2.Recv() == nil }) // Rebind to stream2. hbs.BindStream(1, stream2) testutil.Eventually(re, func() bool { - newMsg := typeutil.DeepClone(msg, core.RegionHeartbeatResponseFactory) - hbs.SendMsg(region, newMsg) + msg := &hbstream.Operation{ChangePeer: &pdpb.ChangePeer{Peer: &metapb.Peer{Id: 2, StoreId: 2}, ChangeType: eraftpb.ConfChangeType_AddLearnerNode}} + hbs.SendMsg(region, msg) return stream1.Recv() == nil && stream2.Recv() != nil }) // SendErr to stream2. hbs.SendErr(pdpb.ErrorType_UNKNOWN, "test error", &metapb.Peer{Id: 1, StoreId: 1}) res := stream2.Recv() re.NotNil(res) - re.NotNil(res.GetHeader().GetError()) + re.NotNil(res.(*pdpb.RegionHeartbeatResponse).GetHeader().GetError()) // Switch back to 1 again. hbs.BindStream(1, stream1) testutil.Eventually(re, func() bool { - newMsg := typeutil.DeepClone(msg, core.RegionHeartbeatResponseFactory) - hbs.SendMsg(region, newMsg) + msg := &hbstream.Operation{ChangePeer: &pdpb.ChangePeer{Peer: &metapb.Peer{Id: 2, StoreId: 2}, ChangeType: eraftpb.ConfChangeType_AddLearnerNode}} + hbs.SendMsg(region, msg) return stream1.Recv() != nil && stream2.Recv() == nil }) } diff --git a/pkg/schedule/hbstream/heartbeat_streams.go b/pkg/schedule/hbstream/heartbeat_streams.go index d80b6ff3a46..e7d7f688035 100644 --- a/pkg/schedule/hbstream/heartbeat_streams.go +++ b/pkg/schedule/hbstream/heartbeat_streams.go @@ -23,16 +23,30 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" + "github.com/pingcap/kvproto/pkg/schedulingpb" "github.com/pingcap/log" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/errs" + "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/utils/logutil" "go.uber.org/zap" ) +// Operation is detailed scheduling step of a region. +type Operation struct { + ChangePeer *pdpb.ChangePeer + // Pd can return transfer_leader to let TiKV does leader transfer itself. + TransferLeader *pdpb.TransferLeader + Merge *pdpb.Merge + // PD sends split_region to let TiKV split a region into two regions. + SplitRegion *pdpb.SplitRegion + ChangePeerV2 *pdpb.ChangePeerV2 + SwitchWitnesses *pdpb.BatchSwitchWitness +} + // HeartbeatStream is an interface. type HeartbeatStream interface { - Send(*pdpb.RegionHeartbeatResponse) error + Send(core.RegionHeartbeatResponse) error } const ( @@ -52,33 +66,35 @@ type HeartbeatStreams struct { hbStreamCancel context.CancelFunc clusterID uint64 streams map[uint64]HeartbeatStream - msgCh chan *pdpb.RegionHeartbeatResponse + msgCh chan core.RegionHeartbeatResponse streamCh chan streamUpdate storeInformer core.StoreSetInformer + typ string needRun bool // For test only. } // NewHeartbeatStreams creates a new HeartbeatStreams which enable background running by default. 
-func NewHeartbeatStreams(ctx context.Context, clusterID uint64, storeInformer core.StoreSetInformer) *HeartbeatStreams { - return newHbStreams(ctx, clusterID, storeInformer, true) +func NewHeartbeatStreams(ctx context.Context, clusterID uint64, typ string, storeInformer core.StoreSetInformer) *HeartbeatStreams { + return newHbStreams(ctx, clusterID, typ, storeInformer, true) } // NewTestHeartbeatStreams creates a new HeartbeatStreams for test purpose only. // Please use NewHeartbeatStreams for other usage. func NewTestHeartbeatStreams(ctx context.Context, clusterID uint64, storeInformer core.StoreSetInformer, needRun bool) *HeartbeatStreams { - return newHbStreams(ctx, clusterID, storeInformer, needRun) + return newHbStreams(ctx, clusterID, "", storeInformer, needRun) } -func newHbStreams(ctx context.Context, clusterID uint64, storeInformer core.StoreSetInformer, needRun bool) *HeartbeatStreams { +func newHbStreams(ctx context.Context, clusterID uint64, typ string, storeInformer core.StoreSetInformer, needRun bool) *HeartbeatStreams { hbStreamCtx, hbStreamCancel := context.WithCancel(ctx) hs := &HeartbeatStreams{ hbStreamCtx: hbStreamCtx, hbStreamCancel: hbStreamCancel, clusterID: clusterID, streams: make(map[uint64]HeartbeatStream), - msgCh: make(chan *pdpb.RegionHeartbeatResponse, heartbeatChanCapacity), + msgCh: make(chan core.RegionHeartbeatResponse, heartbeatChanCapacity), streamCh: make(chan streamUpdate, 1), storeInformer: storeInformer, + typ: typ, needRun: needRun, } if needRun { @@ -96,7 +112,13 @@ func (s *HeartbeatStreams) run() { keepAliveTicker := time.NewTicker(heartbeatStreamKeepAliveInterval) defer keepAliveTicker.Stop() - keepAlive := &pdpb.RegionHeartbeatResponse{Header: &pdpb.ResponseHeader{ClusterId: s.clusterID}} + var keepAlive core.RegionHeartbeatResponse + switch s.typ { + case utils.SchedulingServiceName: + keepAlive = &schedulingpb.RegionHeartbeatResponse{Header: &schedulingpb.ResponseHeader{ClusterId: s.clusterID}} + default: + keepAlive = &pdpb.RegionHeartbeatResponse{Header: &pdpb.ResponseHeader{ClusterId: s.clusterID}} + } for { select { @@ -108,7 +130,7 @@ func (s *HeartbeatStreams) run() { store := s.storeInformer.GetStore(storeID) if store == nil { log.Error("failed to get store", - zap.Uint64("region-id", msg.RegionId), + zap.Uint64("region-id", msg.GetRegionId()), zap.Uint64("store-id", storeID), errs.ZapError(errs.ErrGetSourceStore)) delete(s.streams, storeID) continue @@ -117,7 +139,7 @@ func (s *HeartbeatStreams) run() { if stream, ok := s.streams[storeID]; ok { if err := stream.Send(msg); err != nil { log.Error("send heartbeat message fail", - zap.Uint64("region-id", msg.RegionId), errs.ZapError(errs.ErrGRPCSend.Wrap(err).GenWithStackByArgs())) + zap.Uint64("region-id", msg.GetRegionId()), errs.ZapError(errs.ErrGRPCSend.Wrap(err).GenWithStackByArgs())) delete(s.streams, storeID) heartbeatStreamCounter.WithLabelValues(storeAddress, storeLabel, "push", "err").Inc() } else { @@ -125,7 +147,7 @@ func (s *HeartbeatStreams) run() { } } else { log.Debug("heartbeat stream not found, skip send message", - zap.Uint64("region-id", msg.RegionId), + zap.Uint64("region-id", msg.GetRegionId()), zap.Uint64("store-id", storeID)) heartbeatStreamCounter.WithLabelValues(storeAddress, storeLabel, "push", "skip").Inc() } @@ -174,18 +196,44 @@ func (s *HeartbeatStreams) BindStream(storeID uint64, stream HeartbeatStream) { } // SendMsg sends a message to related store. 
-func (s *HeartbeatStreams) SendMsg(region *core.RegionInfo, msg *pdpb.RegionHeartbeatResponse) { +func (s *HeartbeatStreams) SendMsg(region *core.RegionInfo, op *Operation) { if region.GetLeader() == nil { return } - msg.Header = &pdpb.ResponseHeader{ClusterId: s.clusterID} - msg.RegionId = region.GetID() - msg.RegionEpoch = region.GetRegionEpoch() - msg.TargetPeer = region.GetLeader() + // TODO: use generic + var resp core.RegionHeartbeatResponse + switch s.typ { + case utils.SchedulingServiceName: + resp = &schedulingpb.RegionHeartbeatResponse{ + Header: &schedulingpb.ResponseHeader{ClusterId: s.clusterID}, + RegionId: region.GetID(), + RegionEpoch: region.GetRegionEpoch(), + TargetPeer: region.GetLeader(), + ChangePeer: op.ChangePeer, + TransferLeader: op.TransferLeader, + Merge: op.Merge, + SplitRegion: op.SplitRegion, + ChangePeerV2: op.ChangePeerV2, + SwitchWitnesses: op.SwitchWitnesses, + } + default: + resp = &pdpb.RegionHeartbeatResponse{ + Header: &pdpb.ResponseHeader{ClusterId: s.clusterID}, + RegionId: region.GetID(), + RegionEpoch: region.GetRegionEpoch(), + TargetPeer: region.GetLeader(), + ChangePeer: op.ChangePeer, + TransferLeader: op.TransferLeader, + Merge: op.Merge, + SplitRegion: op.SplitRegion, + ChangePeerV2: op.ChangePeerV2, + SwitchWitnesses: op.SwitchWitnesses, + } + } select { - case s.msgCh <- msg: + case s.msgCh <- resp: case <-s.hbStreamCtx.Done(): } } diff --git a/pkg/schedule/operator/step.go b/pkg/schedule/operator/step.go index 1a2107cb265..6f14cbb326b 100644 --- a/pkg/schedule/operator/step.go +++ b/pkg/schedule/operator/step.go @@ -28,6 +28,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/core/storelimit" "github.com/tikv/pd/pkg/schedule/config" + "github.com/tikv/pd/pkg/schedule/hbstream" "github.com/tikv/pd/pkg/utils/typeutil" "go.uber.org/zap" ) @@ -57,7 +58,7 @@ type OpStep interface { CheckInProgress(ci *core.BasicCluster, config config.SharedConfigProvider, region *core.RegionInfo) error Influence(opInfluence OpInfluence, region *core.RegionInfo) Timeout(regionSize int64) time.Duration - GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse + GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation } // TransferLeader is an OpStep that transfers a region's leader. @@ -126,12 +127,12 @@ func (tl TransferLeader) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (tl TransferLeader) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (tl TransferLeader) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { peers := make([]*metapb.Peer, 0, len(tl.ToStores)) for _, storeID := range tl.ToStores { peers = append(peers, region.GetStorePeer(storeID)) } - return &pdpb.RegionHeartbeatResponse{ + return &hbstream.Operation{ TransferLeader: &pdpb.TransferLeader{ Peer: region.GetStorePeer(tl.ToStore), Peers: peers, @@ -210,7 +211,7 @@ func (ap AddPeer) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (ap AddPeer) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (ap AddPeer) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { peer := region.GetStorePeer(ap.ToStore) if peer != nil { // The newly added peer is pending. 
@@ -274,7 +275,7 @@ func (bw BecomeWitness) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (bw BecomeWitness) GetCmd(_ *core.RegionInfo, _ bool) *pdpb.RegionHeartbeatResponse { +func (bw BecomeWitness) GetCmd(_ *core.RegionInfo, _ bool) *hbstream.Operation { return switchWitness(bw.PeerID, true) } @@ -342,7 +343,7 @@ func (bn BecomeNonWitness) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (bn BecomeNonWitness) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (bn BecomeNonWitness) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { return switchWitness(bn.PeerID, false) } @@ -426,7 +427,7 @@ func (bsw BatchSwitchWitness) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (bsw BatchSwitchWitness) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (bsw BatchSwitchWitness) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { switches := make([]*pdpb.SwitchWitness, 0, len(bsw.ToWitnesses)+len(bsw.ToNonWitnesses)) for _, w := range bsw.ToWitnesses { switches = append(switches, w.GetCmd(region, useConfChangeV2).SwitchWitnesses.SwitchWitnesses...) @@ -434,7 +435,7 @@ func (bsw BatchSwitchWitness) GetCmd(region *core.RegionInfo, useConfChangeV2 bo for _, nw := range bsw.ToNonWitnesses { switches = append(switches, nw.GetCmd(region, useConfChangeV2).SwitchWitnesses.SwitchWitnesses...) } - return &pdpb.RegionHeartbeatResponse{ + return &hbstream.Operation{ SwitchWitnesses: &pdpb.BatchSwitchWitness{ SwitchWitnesses: switches, }, @@ -522,7 +523,7 @@ func (al AddLearner) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (al AddLearner) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (al AddLearner) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { if region.GetStorePeer(al.ToStore) != nil { // The newly added peer is pending. return nil @@ -581,7 +582,7 @@ func (pl PromoteLearner) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (pl PromoteLearner) GetCmd(_ *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (pl PromoteLearner) GetCmd(_ *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { return createResponse(addNode(pl.PeerID, pl.ToStore, pl.IsWitness), useConfChangeV2) } @@ -652,7 +653,7 @@ func (rp RemovePeer) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (rp RemovePeer) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (rp RemovePeer) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { return createResponse(&pdpb.ChangePeer{ ChangeType: eraftpb.ConfChangeType_RemoveNode, Peer: region.GetStorePeer(rp.FromStore), @@ -714,11 +715,11 @@ func (mr MergeRegion) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. 
-func (mr MergeRegion) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (mr MergeRegion) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { if mr.IsPassive { return nil } - return &pdpb.RegionHeartbeatResponse{ + return &hbstream.Operation{ Merge: &pdpb.Merge{ Target: mr.ToRegion, }, @@ -768,8 +769,8 @@ func (sr SplitRegion) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (sr SplitRegion) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { - return &pdpb.RegionHeartbeatResponse{ +func (sr SplitRegion) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { + return &hbstream.Operation{ SplitRegion: &pdpb.SplitRegion{ Policy: sr.Policy, Keys: sr.SplitKeys, @@ -818,7 +819,7 @@ func (dv DemoteVoter) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (dv DemoteVoter) GetCmd(_ *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (dv DemoteVoter) GetCmd(_ *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { return createResponse(addLearnerNode(dv.PeerID, dv.ToStore, dv.IsWitness), useConfChangeV2) } @@ -940,7 +941,7 @@ func (cpe ChangePeerV2Enter) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. -func (cpe ChangePeerV2Enter) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (cpe ChangePeerV2Enter) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { if !useConfChangeV2 { // only supported in ChangePeerV2 return nil @@ -952,7 +953,7 @@ func (cpe ChangePeerV2Enter) GetCmd(region *core.RegionInfo, useConfChangeV2 boo for _, dv := range cpe.DemoteVoters { changes = append(changes, dv.GetCmd(region, useConfChangeV2).ChangePeerV2.Changes...) } - return &pdpb.RegionHeartbeatResponse{ + return &hbstream.Operation{ ChangePeerV2: &pdpb.ChangePeerV2{ Changes: changes, }, @@ -1080,12 +1081,12 @@ func (cpl ChangePeerV2Leave) Timeout(regionSize int64) time.Duration { } // GetCmd returns the schedule command for heartbeat response. 
-func (cpl ChangePeerV2Leave) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func (cpl ChangePeerV2Leave) GetCmd(region *core.RegionInfo, useConfChangeV2 bool) *hbstream.Operation { if !useConfChangeV2 { // only supported in ChangePeerV2 return nil } - return &pdpb.RegionHeartbeatResponse{ + return &hbstream.Operation{ ChangePeerV2: &pdpb.ChangePeerV2{}, } } @@ -1143,21 +1144,21 @@ func addLearnerNode(id, storeID uint64, isWitness bool) *pdpb.ChangePeer { } } -func createResponse(change *pdpb.ChangePeer, useConfChangeV2 bool) *pdpb.RegionHeartbeatResponse { +func createResponse(change *pdpb.ChangePeer, useConfChangeV2 bool) *hbstream.Operation { if useConfChangeV2 { - return &pdpb.RegionHeartbeatResponse{ + return &hbstream.Operation{ ChangePeerV2: &pdpb.ChangePeerV2{ Changes: []*pdpb.ChangePeer{change}, }, } } - return &pdpb.RegionHeartbeatResponse{ + return &hbstream.Operation{ ChangePeer: change, } } -func switchWitness(peerID uint64, isWitness bool) *pdpb.RegionHeartbeatResponse { - return &pdpb.RegionHeartbeatResponse{ +func switchWitness(peerID uint64, isWitness bool) *hbstream.Operation { + return &hbstream.Operation{ SwitchWitnesses: &pdpb.BatchSwitchWitness{ SwitchWitnesses: []*pdpb.SwitchWitness{{PeerId: peerID, IsWitness: isWitness}}, }, diff --git a/server/api/stats.go b/server/api/stats.go index e8b04ba588e..1798597b6cc 100644 --- a/server/api/stats.go +++ b/server/api/stats.go @@ -48,7 +48,7 @@ func (h *statsHandler) GetRegionStatus(w http.ResponseWriter, r *http.Request) { if r.URL.Query().Has("count") { stats = rc.GetRegionCount([]byte(startKey), []byte(endKey)) } else { - stats = rc.GetRegionStats([]byte(startKey), []byte(endKey)) + stats = rc.GetRegionStatsByRange([]byte(startKey), []byte(endKey)) } h.rd.JSON(w, http.StatusOK, stats) } diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 94761c330b6..dbd640d6e8c 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -33,6 +33,7 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/log" + "github.com/tikv/pd/pkg/cluster" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/core/storelimit" "github.com/tikv/pd/pkg/errs" @@ -894,11 +895,21 @@ func (c *RaftCluster) GetSuspectRegions() []uint64 { return c.coordinator.GetCheckerController().GetSuspectRegions() } -// GetHotStat gets hot stat for test. +// GetHotStat gets hot stat. func (c *RaftCluster) GetHotStat() *statistics.HotStat { return c.hotStat } +// GetRegionStats gets region statistics. +func (c *RaftCluster) GetRegionStats() *statistics.RegionStatistics { + return c.regionStats +} + +// GetLabelStats gets label statistics. +func (c *RaftCluster) GetLabelStats() *statistics.LabelStatistics { + return c.labelLevelStats +} + // RemoveSuspectRegion removes region from suspect list. 
func (c *RaftCluster) RemoveSuspectRegion(id uint64) { c.coordinator.GetCheckerController().RemoveSuspectRegion(id) @@ -1099,15 +1110,7 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { } if !c.isAPIServiceMode { - c.hotStat.CheckWriteAsync(statistics.NewCheckExpiredItemTask(region)) - c.hotStat.CheckReadAsync(statistics.NewCheckExpiredItemTask(region)) - reportInterval := region.GetInterval() - interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetWriteLoads(), interval) - c.hotStat.CheckWriteAsync(statistics.NewCheckPeerTask(peerInfo, region)) - } - c.coordinator.GetSchedulersController().CheckTransferWitnessLeader(region) + cluster.HandleStatsAsync(c, region) } hasRegionStats := c.regionStats != nil @@ -1140,27 +1143,16 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { if overlaps, err = c.core.AtomicCheckAndPutRegion(region); err != nil { return err } - - for _, item := range overlaps { - if !c.isAPIServiceMode { - if c.regionStats != nil { - c.regionStats.ClearDefunctRegion(item.GetID()) - } - c.labelLevelStats.ClearDefunctRegion(item.GetID()) - } - c.ruleManager.InvalidCache(item.GetID()) + if !c.isAPIServiceMode { + cluster.HandleOverlaps(c, overlaps) } regionUpdateCacheEventCounter.Inc() } if !c.isAPIServiceMode { - if hasRegionStats { - c.regionStats.Observe(region, c.getRegionStoresLocked(region)) - } - } - if !c.IsPrepared() && isNew { - c.coordinator.GetPrepareChecker().Collect(region) + cluster.Collect(c, region, c.GetRegionStores(region), hasRegionStats, isNew, c.IsPrepared()) } + if c.storage != nil { // If there are concurrent heartbeats from the same region, the last write will win even if // writes to storage in the critical area. So don't use mutex to protect it. @@ -2333,8 +2325,8 @@ func (c *RaftCluster) PutMetaCluster(meta *metapb.Cluster) error { return c.putMetaLocked(typeutil.DeepClone(meta, core.ClusterFactory)) } -// GetRegionStats returns region statistics from cluster. -func (c *RaftCluster) GetRegionStats(startKey, endKey []byte) *statistics.RegionStats { +// GetRegionStatsByRange returns region statistics from cluster. 
+func (c *RaftCluster) GetRegionStatsByRange(startKey, endKey []byte) *statistics.RegionStats { return statistics.GetRegionStats(c.core.ScanRegions(startKey, endKey, -1)) } diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index 5679fd6128d..c9d4d0f8f61 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -3637,7 +3637,7 @@ func TestInterval(t *testing.T) { func waitAddLearner(re *require.Assertions, stream mockhbstream.HeartbeatStream, region *core.RegionInfo, storeID uint64) *core.RegionInfo { var res *pdpb.RegionHeartbeatResponse testutil.Eventually(re, func() bool { - if res = stream.Recv(); res != nil { + if res = stream.Recv().(*pdpb.RegionHeartbeatResponse); res != nil { return res.GetRegionId() == region.GetID() && res.GetChangePeer().GetChangeType() == eraftpb.ConfChangeType_AddLearnerNode && res.GetChangePeer().GetPeer().GetStoreId() == storeID @@ -3653,7 +3653,7 @@ func waitAddLearner(re *require.Assertions, stream mockhbstream.HeartbeatStream, func waitPromoteLearner(re *require.Assertions, stream mockhbstream.HeartbeatStream, region *core.RegionInfo, storeID uint64) *core.RegionInfo { var res *pdpb.RegionHeartbeatResponse testutil.Eventually(re, func() bool { - if res = stream.Recv(); res != nil { + if res = stream.Recv().(*pdpb.RegionHeartbeatResponse); res != nil { return res.GetRegionId() == region.GetID() && res.GetChangePeer().GetChangeType() == eraftpb.ConfChangeType_AddNode && res.GetChangePeer().GetPeer().GetStoreId() == storeID @@ -3670,7 +3670,7 @@ func waitPromoteLearner(re *require.Assertions, stream mockhbstream.HeartbeatStr func waitRemovePeer(re *require.Assertions, stream mockhbstream.HeartbeatStream, region *core.RegionInfo, storeID uint64) *core.RegionInfo { var res *pdpb.RegionHeartbeatResponse testutil.Eventually(re, func() bool { - if res = stream.Recv(); res != nil { + if res = stream.Recv().(*pdpb.RegionHeartbeatResponse); res != nil { return res.GetRegionId() == region.GetID() && res.GetChangePeer().GetChangeType() == eraftpb.ConfChangeType_RemoveNode && res.GetChangePeer().GetPeer().GetStoreId() == storeID @@ -3686,7 +3686,7 @@ func waitRemovePeer(re *require.Assertions, stream mockhbstream.HeartbeatStream, func waitTransferLeader(re *require.Assertions, stream mockhbstream.HeartbeatStream, region *core.RegionInfo, storeID uint64) *core.RegionInfo { var res *pdpb.RegionHeartbeatResponse testutil.Eventually(re, func() bool { - if res = stream.Recv(); res != nil { + if res = stream.Recv().(*pdpb.RegionHeartbeatResponse); res != nil { if res.GetRegionId() == region.GetID() { for _, peer := range append(res.GetTransferLeader().GetPeers(), res.GetTransferLeader().GetPeer()) { if peer.GetStoreId() == storeID { diff --git a/server/grpc_service.go b/server/grpc_service.go index 5a483b71818..5e40bc1c732 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -1093,14 +1093,14 @@ type heartbeatServer struct { closed int32 } -func (s *heartbeatServer) Send(m *pdpb.RegionHeartbeatResponse) error { +func (s *heartbeatServer) Send(m core.RegionHeartbeatResponse) error { if atomic.LoadInt32(&s.closed) == 1 { return io.EOF } done := make(chan error, 1) go func() { defer logutil.LogPanic() - done <- s.stream.Send(m) + done <- s.stream.Send(m.(*pdpb.RegionHeartbeatResponse)) }() timer := time.NewTimer(heartbeatSendTimeout) defer timer.Stop() @@ -1232,6 +1232,9 @@ func (s *GrpcServer) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error lastForwardedHost string lastBind time.Time errCh chan 
error + schedulingStream schedulingpb.Scheduling_RegionHeartbeatClient + cancel1 context.CancelFunc + lastPrimaryAddr string ) defer func() { // cancel the forward stream @@ -1345,6 +1348,55 @@ func (s *GrpcServer) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error s.hbStreams.SendErr(pdpb.ErrorType_UNKNOWN, msg, request.GetLeader()) continue } + + if s.IsAPIServiceMode() { + ctx := stream.Context() + primaryAddr, _ := s.GetServicePrimaryAddr(ctx, utils.SchedulingServiceName) + if schedulingStream == nil || lastPrimaryAddr != primaryAddr { + if cancel1 != nil { + cancel1() + } + client, err := s.getDelegateClient(ctx, primaryAddr) + if err != nil { + log.Error("get delegate client failed", zap.Error(err)) + } + + log.Info("create region heartbeat forward stream", zap.String("forwarded-host", primaryAddr)) + schedulingStream, cancel1, err = s.createSchedulingStream(client) + if err != nil { + log.Error("create region heartbeat forward stream failed", zap.Error(err)) + } else { + lastPrimaryAddr = primaryAddr + errCh = make(chan error, 1) + go forwardSchedulingToServer(schedulingStream, server, errCh) + } + } + if schedulingStream != nil { + req := &schedulingpb.RegionHeartbeatRequest{ + Header: &schedulingpb.RequestHeader{ + ClusterId: request.GetHeader().GetClusterId(), + SenderId: request.GetHeader().GetSenderId(), + }, + Region: request.GetRegion(), + Leader: request.GetLeader(), + DownPeers: request.GetDownPeers(), + PendingPeers: request.GetPendingPeers(), + BytesWritten: request.GetBytesWritten(), + BytesRead: request.GetBytesRead(), + KeysWritten: request.GetKeysWritten(), + KeysRead: request.GetKeysRead(), + ApproximateSize: request.GetApproximateSize(), + ApproximateKeys: request.GetApproximateKeys(), + Interval: request.GetInterval(), + Term: request.GetTerm(), + QueryStats: request.GetQueryStats(), + } + if err := schedulingStream.Send(req); err != nil { + log.Error("forward region heartbeat failed", zap.Error(err)) + } + } + } + regionHeartbeatHandleDuration.WithLabelValues(storeAddress, storeLabel).Observe(time.Since(start).Seconds()) regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "report", "ok").Inc() } @@ -2294,6 +2346,47 @@ func forwardRegionHeartbeatClientToServer(forwardStream pdpb.PD_RegionHeartbeatC } } +func (s *GrpcServer) createSchedulingStream(client *grpc.ClientConn) (schedulingpb.Scheduling_RegionHeartbeatClient, context.CancelFunc, error) { + done := make(chan struct{}) + ctx, cancel := context.WithCancel(s.ctx) + go grpcutil.CheckStream(ctx, cancel, done) + forwardStream, err := schedulingpb.NewSchedulingClient(client).RegionHeartbeat(ctx) + done <- struct{}{} + return forwardStream, cancel, err +} + +func forwardSchedulingToServer(forwardStream schedulingpb.Scheduling_RegionHeartbeatClient, server *heartbeatServer, errCh chan error) { + defer logutil.LogPanic() + defer close(errCh) + for { + resp, err := forwardStream.Recv() + if err != nil { + errCh <- errors.WithStack(err) + return + } + response := &pdpb.RegionHeartbeatResponse{ + Header: &pdpb.ResponseHeader{ + ClusterId: resp.GetHeader().GetClusterId(), + // ignore error here + }, + ChangePeer: resp.GetChangePeer(), + TransferLeader: resp.GetTransferLeader(), + RegionId: resp.GetRegionId(), + RegionEpoch: resp.GetRegionEpoch(), + TargetPeer: resp.GetTargetPeer(), + Merge: resp.GetMerge(), + SplitRegion: resp.GetSplitRegion(), + ChangePeerV2: resp.GetChangePeerV2(), + SwitchWitnesses: resp.GetSwitchWitnesses(), + } + + if err := server.Send(response); err != nil { + errCh <- 
errors.WithStack(err) + return + } + } +} + func (s *GrpcServer) createTSOForwardStream( ctx context.Context, client *grpc.ClientConn, ) (tsopb.TSO_TsoClient, context.Context, context.CancelFunc, error) { diff --git a/server/server.go b/server/server.go index 9e72477368d..03e036a968e 100644 --- a/server/server.go +++ b/server/server.go @@ -485,7 +485,7 @@ func (s *Server) startServer(ctx context.Context) error { } s.keyspaceManager = keyspace.NewKeyspaceManager(s.ctx, s.storage, s.cluster, keyspaceIDAllocator, &s.cfg.Keyspace, s.keyspaceGroupManager) s.safePointV2Manager = gc.NewSafePointManagerV2(s.ctx, s.storage, s.storage, s.storage) - s.hbStreams = hbstream.NewHeartbeatStreams(ctx, s.clusterID, s.cluster) + s.hbStreams = hbstream.NewHeartbeatStreams(ctx, s.clusterID, "", s.cluster) // initial hot_region_storage in here. s.hotRegionStorage, err = storage.NewHotRegionsStorage( ctx, filepath.Join(s.cfg.DataDir, "hot-region"), s.encryptionKeyManager, s.handler) diff --git a/tests/integrations/client/go.mod b/tests/integrations/client/go.mod index cbb2c2d5f46..5423ac85689 100644 --- a/tests/integrations/client/go.mod +++ b/tests/integrations/client/go.mod @@ -13,7 +13,7 @@ replace google.golang.org/grpc v1.54.0 => google.golang.org/grpc v1.26.0 require ( github.com/docker/go-units v0.4.0 github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 - github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96 + github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/stretchr/testify v1.8.2 github.com/tikv/pd v0.0.0-00010101000000-000000000000 diff --git a/tests/integrations/client/go.sum b/tests/integrations/client/go.sum index f18313cb3bb..1578b194c3c 100644 --- a/tests/integrations/client/go.sum +++ b/tests/integrations/client/go.sum @@ -402,8 +402,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96 h1:Upb52Po0Ev1lPKQdUT4suRwQ5Z49A7gEmJ0trADKftM= -github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b h1:XwwIxepR+uuSYWhdQtstEdr67XUE7X6lpSIHVh5iWjs= +github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log v0.0.0-20191012051959-b742a5d432e9/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= diff --git a/tests/integrations/mcs/go.mod b/tests/integrations/mcs/go.mod index 50e816cd3ad..2c1dd3b26a1 100644 --- a/tests/integrations/mcs/go.mod +++ b/tests/integrations/mcs/go.mod @@ -11,8 +11,9 @@ replace ( replace google.golang.org/grpc v1.54.0 => google.golang.org/grpc v1.26.0 require ( + github.com/docker/go-units v0.4.0 github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 - github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96 + 
github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/stretchr/testify v1.8.2 github.com/tikv/pd v0.0.0-00010101000000-000000000000 @@ -57,7 +58,6 @@ require ( github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f // indirect github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f // indirect github.com/davecgh/go-spew v1.1.1 // indirect - github.com/docker/go-units v0.4.0 // indirect github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4 // indirect github.com/elliotchance/pie/v2 v2.1.0 // indirect github.com/fogleman/gg v1.3.0 // indirect diff --git a/tests/integrations/mcs/go.sum b/tests/integrations/mcs/go.sum index 8afbfcdc70e..7260a0b36de 100644 --- a/tests/integrations/mcs/go.sum +++ b/tests/integrations/mcs/go.sum @@ -407,8 +407,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96 h1:Upb52Po0Ev1lPKQdUT4suRwQ5Z49A7gEmJ0trADKftM= -github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b h1:XwwIxepR+uuSYWhdQtstEdr67XUE7X6lpSIHVh5iWjs= +github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= diff --git a/tests/integrations/mcs/scheduling/server_test.go b/tests/integrations/mcs/scheduling/server_test.go index e469c593b84..45c25f01d1e 100644 --- a/tests/integrations/mcs/scheduling/server_test.go +++ b/tests/integrations/mcs/scheduling/server_test.go @@ -18,9 +18,11 @@ import ( "context" "fmt" "net/http" + "reflect" "testing" "time" + "github.com/docker/go-units" "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" @@ -287,3 +289,80 @@ func checkEvictLeaderStoreIDs(re *require.Assertions, sc *schedulers.Controller, }) re.ElementsMatch(evictStoreIDs, expected) } + +func (suite *serverTestSuite) TestForwardRegionHeartbeat() { + re := suite.Require() + tc, err := tests.NewTestSchedulingCluster(suite.ctx, 1, suite.backendEndpoints) + re.NoError(err) + defer tc.Destroy() + tc.WaitForPrimaryServing(re) + + s := &server.GrpcServer{Server: suite.pdLeader.GetServer()} + for i := uint64(1); i <= 3; i++ { + resp, err := s.PutStore( + context.Background(), &pdpb.PutStoreRequest{ + Header: &pdpb.RequestHeader{ClusterId: suite.pdLeader.GetClusterID()}, + Store: &metapb.Store{ + Id: i, + Address: fmt.Sprintf("mock://%d", i), + State: metapb.StoreState_Up, + Version: "7.0.0", + }, + }, + ) + re.NoError(err) + re.Empty(resp.GetHeader().GetError()) + } + + grpcPDClient := testutil.MustNewGrpcClient(re, suite.pdLeader.GetServer().GetAddr()) + stream, err := grpcPDClient.RegionHeartbeat(suite.ctx) + 
re.NoError(err) + peers := []*metapb.Peer{ + {Id: 11, StoreId: 1}, + {Id: 22, StoreId: 2}, + {Id: 33, StoreId: 3}, + } + queryStats := &pdpb.QueryStats{ + Get: 5, + Coprocessor: 6, + Scan: 7, + Put: 8, + Delete: 9, + DeleteRange: 10, + AcquirePessimisticLock: 11, + Rollback: 12, + Prewrite: 13, + Commit: 14, + } + interval := &pdpb.TimeInterval{StartTimestamp: 0, EndTimestamp: 10} + downPeers := []*pdpb.PeerStats{{Peer: peers[2], DownSeconds: 100}} + pendingPeers := []*metapb.Peer{peers[2]} + regionReq := &pdpb.RegionHeartbeatRequest{ + Header: testutil.NewRequestHeader(suite.pdLeader.GetClusterID()), + Region: &metapb.Region{Id: 10, Peers: peers, StartKey: []byte("a"), EndKey: []byte("b")}, + Leader: peers[0], + DownPeers: downPeers, + PendingPeers: pendingPeers, + BytesWritten: 10, + BytesRead: 20, + KeysWritten: 100, + KeysRead: 200, + ApproximateSize: 30 * units.MiB, + ApproximateKeys: 300, + Interval: interval, + QueryStats: queryStats, + Term: 1, + CpuUsage: 100, + } + err = stream.Send(regionReq) + re.NoError(err) + testutil.Eventually(re, func() bool { + region := tc.GetPrimaryServer().GetCluster().GetRegion(10) + return region.GetBytesRead() == 20 && region.GetBytesWritten() == 10 && + region.GetKeysRead() == 200 && region.GetKeysWritten() == 100 && region.GetTerm() == 1 && + region.GetApproximateKeys() == 300 && region.GetApproximateSize() == 30 && + reflect.DeepEqual(region.GetLeader(), peers[0]) && + reflect.DeepEqual(region.GetInterval(), interval) && region.GetReadQueryNum() == 18 && region.GetWriteQueryNum() == 77 && + reflect.DeepEqual(region.GetDownPeers(), downPeers) && reflect.DeepEqual(region.GetPendingPeers(), pendingPeers) + }) +} diff --git a/tests/integrations/tso/go.mod b/tests/integrations/tso/go.mod index cea17c73141..1252f121ca5 100644 --- a/tests/integrations/tso/go.mod +++ b/tests/integrations/tso/go.mod @@ -13,7 +13,7 @@ replace google.golang.org/grpc v1.54.0 => google.golang.org/grpc v1.26.0 require ( github.com/pingcap/failpoint v0.0.0-20220801062533-2eaa32854a6c - github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96 + github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b github.com/stretchr/testify v1.8.4 github.com/tikv/pd v0.0.0-00010101000000-000000000000 github.com/tikv/pd/client v0.0.0-00010101000000-000000000000 diff --git a/tests/integrations/tso/go.sum b/tests/integrations/tso/go.sum index 5ccf67f8b13..d417ad89432 100644 --- a/tests/integrations/tso/go.sum +++ b/tests/integrations/tso/go.sum @@ -401,8 +401,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20220801062533-2eaa32854a6c h1:CgbKAHto5CQgWM9fSBIvaxsJHuGP0uM74HXtv3MyyGQ= github.com/pingcap/failpoint v0.0.0-20220801062533-2eaa32854a6c/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96 h1:Upb52Po0Ev1lPKQdUT4suRwQ5Z49A7gEmJ0trADKftM= -github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b h1:XwwIxepR+uuSYWhdQtstEdr67XUE7X6lpSIHVh5iWjs= +github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log 
v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= diff --git a/tests/server/api/api_test.go b/tests/server/api/api_test.go index da86a872045..cc35d9eaab3 100644 --- a/tests/server/api/api_test.go +++ b/tests/server/api/api_test.go @@ -137,6 +137,7 @@ func (suite *middlewareTestSuite) TearDownSuite() { func (suite *middlewareTestSuite) TestRequestInfoMiddleware() { suite.NoError(failpoint.Enable("github.com/tikv/pd/server/api/addRequestInfoMiddleware", "return(true)")) leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + suite.NotNil(leader) input := map[string]interface{}{ "enable-audit": "true", @@ -207,6 +208,7 @@ func BenchmarkDoRequestWithServiceMiddleware(b *testing.B) { func (suite *middlewareTestSuite) TestRateLimitMiddleware() { leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + suite.NotNil(leader) input := map[string]interface{}{ "enable-rate-limit": "true", } @@ -371,6 +373,7 @@ func (suite *middlewareTestSuite) TestRateLimitMiddleware() { func (suite *middlewareTestSuite) TestSwaggerUrl() { leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + suite.NotNil(leader) req, _ := http.NewRequest(http.MethodGet, leader.GetAddr()+"/swagger/ui/index", nil) resp, err := dialClient.Do(req) suite.NoError(err) @@ -380,6 +383,7 @@ func (suite *middlewareTestSuite) TestSwaggerUrl() { func (suite *middlewareTestSuite) TestAuditPrometheusBackend() { leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + suite.NotNil(leader) input := map[string]interface{}{ "enable-audit": "true", } @@ -448,6 +452,7 @@ func (suite *middlewareTestSuite) TestAuditLocalLogBackend() { fname := testutil.InitTempFileLogger("info") defer os.RemoveAll(fname) leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + suite.NotNil(leader) input := map[string]interface{}{ "enable-audit": "true", } diff --git a/tools/pd-api-bench/go.mod b/tools/pd-api-bench/go.mod index bb3d5be530f..21471294afb 100644 --- a/tools/pd-api-bench/go.mod +++ b/tools/pd-api-bench/go.mod @@ -70,7 +70,7 @@ require ( github.com/pingcap/errcode v0.3.0 // indirect github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c // indirect github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 // indirect - github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96 // indirect + github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b // indirect github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 // indirect github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/tools/pd-api-bench/go.sum b/tools/pd-api-bench/go.sum index 29d39f504df..6a133983e28 100644 --- a/tools/pd-api-bench/go.sum +++ b/tools/pd-api-bench/go.sum @@ -263,8 +263,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96 h1:Upb52Po0Ev1lPKQdUT4suRwQ5Z49A7gEmJ0trADKftM= -github.com/pingcap/kvproto v0.0.0-20230911090708-d603cce32b96/go.mod 
h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b h1:XwwIxepR+uuSYWhdQtstEdr67XUE7X6lpSIHVh5iWjs= +github.com/pingcap/kvproto v0.0.0-20230920042517-db656f45023b/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= From 24fffdf718bb62bff1b2cd0d429601ab832ab60c Mon Sep 17 00:00:00 2001 From: Yongbo Jiang Date: Wed, 20 Sep 2023 14:19:43 +0800 Subject: [PATCH 06/14] scatter: fix incorrect judgment condition (#7111) close tikv/pd#7109 Signed-off-by: Cabinfever_B Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/schedule/scatter/region_scatterer.go | 2 +- pkg/schedule/scatter/region_scatterer_test.go | 25 ++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/pkg/schedule/scatter/region_scatterer.go b/pkg/schedule/scatter/region_scatterer.go index c47bcd27e91..68d868750e8 100644 --- a/pkg/schedule/scatter/region_scatterer.go +++ b/pkg/schedule/scatter/region_scatterer.go @@ -451,7 +451,7 @@ func (r *RegionScatterer) selectNewPeer(context engineContext, group string, pee originStorePickedCount := uint64(math.MaxUint64) for _, store := range stores { storeCount := context.selectedPeer.Get(store.GetID(), group) - if store.GetID() == peer.GetId() { + if store.GetID() == peer.GetStoreId() { originStorePickedCount = storeCount } // If storeCount is equal to the maxStoreTotalCount, we should skip this store as candidate. diff --git a/pkg/schedule/scatter/region_scatterer_test.go b/pkg/schedule/scatter/region_scatterer_test.go index 0fc7f0967d7..681b863aea6 100644 --- a/pkg/schedule/scatter/region_scatterer_test.go +++ b/pkg/schedule/scatter/region_scatterer_test.go @@ -67,6 +67,7 @@ func TestScatterRegions(t *testing.T) { scatter(re, 5, 50, true) scatter(re, 5, 500, true) scatter(re, 6, 50, true) + scatter(re, 7, 71, true) scatter(re, 5, 50, false) scatterSpecial(re, 3, 6, 50) scatterSpecial(re, 5, 5, 50) @@ -132,20 +133,36 @@ func scatter(re *require.Assertions, numStores, numRegions uint64, useRules bool } } } + maxStorePeerTotalCount := uint64(0) + minStorePeerTotalCount := uint64(math.MaxUint64) // Each store should have the same number of peers. for _, count := range countPeers { - re.LessOrEqual(float64(count), 1.1*float64(numRegions*3)/float64(numStores)) - re.GreaterOrEqual(float64(count), 0.9*float64(numRegions*3)/float64(numStores)) + if count > maxStorePeerTotalCount { + maxStorePeerTotalCount = count + } + if count < minStorePeerTotalCount { + minStorePeerTotalCount = count + } } + re.LessOrEqual(maxStorePeerTotalCount-minStorePeerTotalCount, uint64(1)) // Each store should have the same number of leaders. 
re.Len(countPeers, int(numStores)) re.Len(countLeader, int(numStores)) + + maxStoreLeaderTotalCount := uint64(0) + minStoreLeaderTotalCount := uint64(math.MaxUint64) for _, count := range countLeader { - re.LessOrEqual(float64(count), 1.1*float64(numRegions)/float64(numStores)) - re.GreaterOrEqual(float64(count), 0.9*float64(numRegions)/float64(numStores)) + if count > maxStoreLeaderTotalCount { + maxStoreLeaderTotalCount = count + } + if count < minStoreLeaderTotalCount { + minStoreLeaderTotalCount = count + } } + // Since the scatter leader depends on the scatter result of the peer, the maximum difference is 2. + re.LessOrEqual(maxStoreLeaderTotalCount-minStoreLeaderTotalCount, uint64(2)) re.GreaterOrEqual(noNeedMoveNum, 0) } From 925181c073ecdfb286c7f155f3a4f63071500f8b Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Wed, 20 Sep 2023 15:52:44 +0800 Subject: [PATCH 07/14] metrics: rename `mergeCheckerSpecialPeerCounter` to `mergeCheckerUnhealthyRegionCounter` (#7117) ref tikv/pd#4399 Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/schedule/checker/merge_checker.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/schedule/checker/merge_checker.go b/pkg/schedule/checker/merge_checker.go index 0243bfbe165..1ce7bddd1dc 100644 --- a/pkg/schedule/checker/merge_checker.go +++ b/pkg/schedule/checker/merge_checker.go @@ -53,9 +53,9 @@ var ( mergeCheckerPausedCounter = checkerCounter.WithLabelValues(mergeCheckerName, "paused") mergeCheckerRecentlySplitCounter = checkerCounter.WithLabelValues(mergeCheckerName, "recently-split") mergeCheckerRecentlyStartCounter = checkerCounter.WithLabelValues(mergeCheckerName, "recently-start") - mergeCheckerSkipUninitRegionCounter = checkerCounter.WithLabelValues(mergeCheckerName, "skip-uninit-region") + mergeCheckerNoLeaderCounter = checkerCounter.WithLabelValues(mergeCheckerName, "no-leader") mergeCheckerNoNeedCounter = checkerCounter.WithLabelValues(mergeCheckerName, "no-need") - mergeCheckerSpecialPeerCounter = checkerCounter.WithLabelValues(mergeCheckerName, "special-peer") + mergeCheckerUnhealthyRegionCounter = checkerCounter.WithLabelValues(mergeCheckerName, "unhealthy-region") mergeCheckerAbnormalReplicaCounter = checkerCounter.WithLabelValues(mergeCheckerName, "abnormal-replica") mergeCheckerHotRegionCounter = checkerCounter.WithLabelValues(mergeCheckerName, "hot-region") mergeCheckerNoTargetCounter = checkerCounter.WithLabelValues(mergeCheckerName, "no-target") @@ -129,7 +129,7 @@ func (m *MergeChecker) Check(region *core.RegionInfo) []*operator.Operator { // when pd just started, it will load region meta from region storage, if region.GetLeader() == nil { - mergeCheckerSkipUninitRegionCounter.Inc() + mergeCheckerNoLeaderCounter.Inc() return nil } @@ -141,7 +141,7 @@ func (m *MergeChecker) Check(region *core.RegionInfo) []*operator.Operator { // skip region has down peers or pending peers if !filter.IsRegionHealthy(region) { - mergeCheckerSpecialPeerCounter.Inc() + mergeCheckerUnhealthyRegionCounter.Inc() return nil } From a9973165571504f1f9091e9887d0ea9e68517485 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Thu, 21 Sep 2023 10:30:13 +0800 Subject: [PATCH 08/14] cluster: handle region after report split (#6867) close tikv/pd#4157, close tikv/tikv#15210 Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- 
pkg/core/region.go | 40 +++++---- pkg/core/region_test.go | 4 +- pkg/core/store.go | 39 ++++---- pkg/core/store_option.go | 7 ++ pkg/mcs/scheduling/server/cluster.go | 8 +- pkg/schedule/filter/counter.go | 2 + pkg/schedule/filter/counter_test.go | 2 +- pkg/schedule/filter/filters.go | 13 ++- pkg/schedule/filter/region_filters.go | 45 ++++++---- pkg/schedule/filter/status.go | 5 +- pkg/schedule/plan/status.go | 5 +- pkg/schedule/schedulers/balance_leader.go | 13 +-- pkg/schedule/schedulers/balance_test.go | 8 ++ pkg/syncer/client.go | 4 +- server/cluster/cluster.go | 24 ++--- server/cluster/cluster_worker.go | 105 ++++++++++++++-------- server/cluster/cluster_worker_test.go | 101 +++++++++++++++++++-- server/grpc_service.go | 32 ++++++- 18 files changed, 331 insertions(+), 126 deletions(-) diff --git a/pkg/core/region.go b/pkg/core/region.go index 4540f7aafb3..2fec30de132 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -682,9 +682,14 @@ func (r *RegionInfo) isRegionRecreated() bool { return r.GetRegionEpoch().GetVersion() == 1 && r.GetRegionEpoch().GetConfVer() == 1 && (len(r.GetStartKey()) != 0 || len(r.GetEndKey()) != 0) } +// RegionChanged is a struct that records the changes of the region. +type RegionChanged struct { + IsNew, SaveKV, SaveCache, NeedSync bool +} + // RegionGuideFunc is a function that determines which follow-up operations need to be performed based on the origin // and new region information. -type RegionGuideFunc func(region, origin *RegionInfo) (isNew, saveKV, saveCache, needSync bool) +type RegionGuideFunc func(region, origin *RegionInfo) *RegionChanged // GenerateRegionGuideFunc is used to generate a RegionGuideFunc. Control the log output by specifying the log function. // nil means do not print the log. @@ -697,18 +702,19 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { } // Save to storage if meta is updated. // Save to cache if meta or leader is updated, or contains any down/pending peer. - // Mark isNew if the region in cache does not have leader. - return func(region, origin *RegionInfo) (isNew, saveKV, saveCache, needSync bool) { + // Mark IsNew if the region in cache does not have leader. 
+ return func(region, origin *RegionInfo) (changed *RegionChanged) { + changed = &RegionChanged{} if origin == nil { if log.GetLevel() <= zap.DebugLevel { debug("insert new region", zap.Uint64("region-id", region.GetID()), logutil.ZapRedactStringer("meta-region", RegionToHexMeta(region.GetMeta()))) } - saveKV, saveCache, isNew = true, true, true + changed.SaveKV, changed.SaveCache, changed.IsNew = true, true, true } else { if !origin.IsFromHeartbeat() { - isNew = true + changed.IsNew = true } r := region.GetRegionEpoch() o := origin.GetRegionEpoch() @@ -721,7 +727,7 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { zap.Uint64("new-version", r.GetVersion()), ) } - saveKV, saveCache = true, true + changed.SaveKV, changed.SaveCache = true, true } if r.GetConfVer() > o.GetConfVer() { if log.GetLevel() <= zap.InfoLevel { @@ -732,11 +738,11 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { zap.Uint64("new-confver", r.GetConfVer()), ) } - saveKV, saveCache = true, true + changed.SaveCache, changed.SaveKV = true, true } if region.GetLeader().GetId() != origin.GetLeader().GetId() { if origin.GetLeader().GetId() == 0 { - isNew = true + changed.IsNew = true } else if log.GetLevel() <= zap.InfoLevel { info("leader changed", zap.Uint64("region-id", region.GetID()), @@ -745,17 +751,17 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { ) } // We check it first and do not return because the log is important for us to investigate, - saveCache, needSync = true, true + changed.SaveCache, changed.NeedSync = true, true } if len(region.GetPeers()) != len(origin.GetPeers()) { - saveKV, saveCache = true, true + changed.SaveCache, changed.SaveKV = true, true return } if len(region.GetBuckets().GetKeys()) != len(origin.GetBuckets().GetKeys()) { if log.GetLevel() <= zap.DebugLevel { debug("bucket key changed", zap.Uint64("region-id", region.GetID())) } - saveKV, saveCache = true, true + changed.SaveCache, changed.SaveKV = true, true return } // Once flow has changed, will update the cache. 
@@ -763,39 +769,39 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { if region.GetRoundBytesWritten() != origin.GetRoundBytesWritten() || region.GetRoundBytesRead() != origin.GetRoundBytesRead() || region.flowRoundDivisor < origin.flowRoundDivisor { - saveCache, needSync = true, true + changed.SaveCache, changed.NeedSync = true, true return } if !SortedPeersStatsEqual(region.GetDownPeers(), origin.GetDownPeers()) { if log.GetLevel() <= zap.DebugLevel { debug("down-peers changed", zap.Uint64("region-id", region.GetID())) } - saveCache, needSync = true, true + changed.SaveCache, changed.NeedSync = true, true return } if !SortedPeersEqual(region.GetPendingPeers(), origin.GetPendingPeers()) { if log.GetLevel() <= zap.DebugLevel { debug("pending-peers changed", zap.Uint64("region-id", region.GetID())) } - saveCache, needSync = true, true + changed.SaveCache, changed.NeedSync = true, true return } if region.GetApproximateSize() != origin.GetApproximateSize() || region.GetApproximateKeys() != origin.GetApproximateKeys() { - saveCache = true + changed.SaveCache = true return } if region.GetReplicationStatus().GetState() != replication_modepb.RegionReplicationState_UNKNOWN && (region.GetReplicationStatus().GetState() != origin.GetReplicationStatus().GetState() || region.GetReplicationStatus().GetStateId() != origin.GetReplicationStatus().GetStateId()) { - saveCache = true + changed.SaveCache = true return } // Do not save to kv, because 1) flashback will be eventually set to // false, 2) flashback changes almost all regions in a cluster. // Saving kv may downgrade PD performance when there are many regions. if region.IsFlashbackChanged(origin) { - saveCache = true + changed.SaveCache = true return } } diff --git a/pkg/core/region_test.go b/pkg/core/region_test.go index 1e6b43fbf96..3b58f5ee15a 100644 --- a/pkg/core/region_test.go +++ b/pkg/core/region_test.go @@ -333,8 +333,8 @@ func TestNeedSync(t *testing.T) { for _, testCase := range testCases { regionA := region.Clone(testCase.optionsA...) regionB := region.Clone(testCase.optionsB...) - _, _, _, needSync := RegionGuide(regionA, regionB) - re.Equal(testCase.needSync, needSync) + changed := RegionGuide(regionA, regionB) + re.Equal(testCase.needSync, changed.NeedSync) } } diff --git a/pkg/core/store.go b/pkg/core/store.go index 1d3362cac0e..cafb443bb7d 100644 --- a/pkg/core/store.go +++ b/pkg/core/store.go @@ -36,6 +36,7 @@ const ( initialMinSpace = 8 * units.GiB // 2^33=8GB slowStoreThreshold = 80 awakenStoreInterval = 10 * time.Minute // 2 * slowScoreRecoveryTime + splitStoreWait = time.Minute // EngineKey is the label key used to indicate engine. 
EngineKey = "engine" @@ -50,22 +51,23 @@ const ( type StoreInfo struct { meta *metapb.Store *storeStats - pauseLeaderTransfer bool // not allow to be used as source or target of transfer leader - slowStoreEvicted bool // this store has been evicted as a slow store, should not transfer leader to it - slowTrendEvicted bool // this store has been evicted as a slow store by trend, should not transfer leader to it - leaderCount int - regionCount int - learnerCount int - witnessCount int - leaderSize int64 - regionSize int64 - pendingPeerCount int - lastPersistTime time.Time - leaderWeight float64 - regionWeight float64 - limiter storelimit.StoreLimit - minResolvedTS uint64 - lastAwakenTime time.Time + pauseLeaderTransfer bool // not allow to be used as source or target of transfer leader + slowStoreEvicted bool // this store has been evicted as a slow store, should not transfer leader to it + slowTrendEvicted bool // this store has been evicted as a slow store by trend, should not transfer leader to it + leaderCount int + regionCount int + learnerCount int + witnessCount int + leaderSize int64 + regionSize int64 + pendingPeerCount int + lastPersistTime time.Time + leaderWeight float64 + regionWeight float64 + limiter storelimit.StoreLimit + minResolvedTS uint64 + lastAwakenTime time.Time + recentlySplitRegionsTime time.Time } // NewStoreInfo creates StoreInfo with meta data. @@ -539,6 +541,11 @@ func (s *StoreInfo) NeedAwakenStore() bool { return s.GetLastHeartbeatTS().Sub(s.lastAwakenTime) > awakenStoreInterval } +// HasRecentlySplitRegions checks if there are some region are splitted in this store. +func (s *StoreInfo) HasRecentlySplitRegions() bool { + return time.Since(s.recentlySplitRegionsTime) < splitStoreWait +} + var ( // If a store's last heartbeat is storeDisconnectDuration ago, the store will // be marked as disconnected state. The value should be greater than tikv's diff --git a/pkg/core/store_option.go b/pkg/core/store_option.go index 8a2aa1ef089..4d8864ea478 100644 --- a/pkg/core/store_option.go +++ b/pkg/core/store_option.go @@ -274,3 +274,10 @@ func SetLastAwakenTime(lastAwaken time.Time) StoreCreateOption { store.lastAwakenTime = lastAwaken } } + +// SetRecentlySplitRegionsTime sets last split time for the store. +func SetRecentlySplitRegionsTime(recentlySplitRegionsTime time.Time) StoreCreateOption { + return func(store *StoreInfo) { + store.recentlySplitRegionsTime = recentlySplitRegionsTime + } +} diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index 0b9924f230b..81c82d73d33 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -433,8 +433,8 @@ func (c *Cluster) processRegionHeartbeat(region *core.RegionInfo) error { // Save to storage if meta is updated, except for flashback. // Save to cache if meta or leader is updated, or contains any down/pending peer. // Mark isNew if the region in cache does not have leader. - isNew, _, saveCache, _ := core.GenerateRegionGuideFunc(true)(region, origin) - if !saveCache && !isNew { + changed := core.GenerateRegionGuideFunc(true)(region, origin) + if !changed.SaveCache && !changed.IsNew { // Due to some config changes need to update the region stats as well, // so we do some extra checks here. 
if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { @@ -444,7 +444,7 @@ func (c *Cluster) processRegionHeartbeat(region *core.RegionInfo) error { } var overlaps []*core.RegionInfo - if saveCache { + if changed.SaveCache { // To prevent a concurrent heartbeat of another region from overriding the up-to-date region info by a stale one, // check its validation again here. // @@ -456,7 +456,7 @@ func (c *Cluster) processRegionHeartbeat(region *core.RegionInfo) error { cluster.HandleOverlaps(c, overlaps) } - cluster.Collect(c, region, c.GetRegionStores(region), hasRegionStats, isNew, c.IsPrepared()) + cluster.Collect(c, region, c.GetRegionStores(region), hasRegionStats, changed.IsNew, c.IsPrepared()) return nil } diff --git a/pkg/schedule/filter/counter.go b/pkg/schedule/filter/counter.go index 0120ef5b666..0619bbdde29 100644 --- a/pkg/schedule/filter/counter.go +++ b/pkg/schedule/filter/counter.go @@ -127,6 +127,7 @@ const ( storeStateTooManyPendingPeer storeStateRejectLeader storeStateSlowTrend + storeStateRecentlySplitRegions filtersLen ) @@ -156,6 +157,7 @@ var filters = [filtersLen]string{ "store-state-too-many-pending-peers-filter", "store-state-reject-leader-filter", "store-state-slow-trend-filter", + "store-state-recently-split-regions-filter", } // String implements fmt.Stringer interface. diff --git a/pkg/schedule/filter/counter_test.go b/pkg/schedule/filter/counter_test.go index 067a07f138b..f8b6c0bcb8d 100644 --- a/pkg/schedule/filter/counter_test.go +++ b/pkg/schedule/filter/counter_test.go @@ -27,7 +27,7 @@ func TestString(t *testing.T) { expected string }{ {int(storeStateTombstone), "store-state-tombstone-filter"}, - {int(filtersLen - 1), "store-state-slow-trend-filter"}, + {int(filtersLen - 1), "store-state-recently-split-regions-filter"}, {int(filtersLen), "unknown"}, } diff --git a/pkg/schedule/filter/filters.go b/pkg/schedule/filter/filters.go index 0d188e69180..e76969127d1 100644 --- a/pkg/schedule/filter/filters.go +++ b/pkg/schedule/filter/filters.go @@ -332,6 +332,8 @@ type StoreStateFilter struct { // If it checks failed, the operator will be put back to the waiting queue util the limit is available. // But the scheduler should keep the same with the operator level. OperatorLevel constant.PriorityLevel + // check the store not split recently in it if set true. + ForbidRecentlySplitRegions bool // Reason is used to distinguish the reason of store state filter Reason filterType } @@ -471,6 +473,15 @@ func (f *StoreStateFilter) hasRejectLeaderProperty(conf config.SharedConfigProvi return statusOK } +func (f *StoreStateFilter) hasRecentlySplitRegions(_ config.SharedConfigProvider, store *core.StoreInfo) *plan.Status { + if f.ForbidRecentlySplitRegions && store.HasRecentlySplitRegions() { + f.Reason = storeStateRecentlySplitRegions + return statusStoreRecentlySplitRegions + } + f.Reason = storeStateOK + return statusOK +} + // The condition table. // Y: the condition is temporary (expected to become false soon). // N: the condition is expected to be true for a long time. 
@@ -499,7 +510,7 @@ func (f *StoreStateFilter) anyConditionMatch(typ int, conf config.SharedConfigPr var funcs []conditionFunc switch typ { case leaderSource: - funcs = []conditionFunc{f.isRemoved, f.isDown, f.pauseLeaderTransfer, f.isDisconnected} + funcs = []conditionFunc{f.isRemoved, f.isDown, f.pauseLeaderTransfer, f.isDisconnected, f.hasRecentlySplitRegions} case regionSource: funcs = []conditionFunc{f.isBusy, f.exceedRemoveLimit, f.tooManySnapshots} case witnessSource: diff --git a/pkg/schedule/filter/region_filters.go b/pkg/schedule/filter/region_filters.go index 799cee7d90c..70cdb8500b0 100644 --- a/pkg/schedule/filter/region_filters.go +++ b/pkg/schedule/filter/region_filters.go @@ -24,24 +24,6 @@ import ( "github.com/tikv/pd/pkg/slice" ) -// SelectRegions selects regions that be selected from the list. -func SelectRegions(regions []*core.RegionInfo, filters ...RegionFilter) []*core.RegionInfo { - return filterRegionsBy(regions, func(r *core.RegionInfo) bool { - return slice.AllOf(filters, func(i int) bool { - return filters[i].Select(r).IsOK() - }) - }) -} - -func filterRegionsBy(regions []*core.RegionInfo, keepPred func(*core.RegionInfo) bool) (selected []*core.RegionInfo) { - for _, s := range regions { - if keepPred(s) { - selected = append(selected, s) - } - } - return -} - // SelectOneRegion selects one region that be selected from the list. func SelectOneRegion(regions []*core.RegionInfo, collector *plan.Collector, filters ...RegionFilter) *core.RegionInfo { for _, r := range regions { @@ -173,7 +155,7 @@ type SnapshotSenderFilter struct { senders map[uint64]struct{} } -// NewSnapshotSendFilter returns creates a RegionFilter that filters regions with witness peer on the specific store. +// NewSnapshotSendFilter returns creates a RegionFilter that filters regions whose leader has sender limit on the specific store. // level should be set as same with the operator priority level. func NewSnapshotSendFilter(stores []*core.StoreInfo, level constant.PriorityLevel) RegionFilter { senders := make(map[uint64]struct{}) @@ -193,3 +175,28 @@ func (f *SnapshotSenderFilter) Select(region *core.RegionInfo) *plan.Status { } return statusRegionLeaderSendSnapshotThrottled } + +// StoreRecentlySplitFilter filer the region whose leader store not recently split regions. +type StoreRecentlySplitFilter struct { + recentlySplitStores map[uint64]struct{} +} + +// NewStoreRecentlySplitFilter returns creates a StoreRecentlySplitFilter. +func NewStoreRecentlySplitFilter(stores []*core.StoreInfo) RegionFilter { + recentlySplitStores := make(map[uint64]struct{}) + for _, store := range stores { + if store.HasRecentlySplitRegions() { + recentlySplitStores[store.GetID()] = struct{}{} + } + } + return &StoreRecentlySplitFilter{recentlySplitStores: recentlySplitStores} +} + +// Select returns ok if the region leader not in the recentlySplitStores. 
+func (f *StoreRecentlySplitFilter) Select(region *core.RegionInfo) *plan.Status { + leaderStoreID := region.GetLeader().GetStoreId() + if _, ok := f.recentlySplitStores[leaderStoreID]; ok { + return statusStoreRecentlySplitRegions + } + return statusOK +} diff --git a/pkg/schedule/filter/status.go b/pkg/schedule/filter/status.go index 930c59e3ba8..9b6665a2fa7 100644 --- a/pkg/schedule/filter/status.go +++ b/pkg/schedule/filter/status.go @@ -39,8 +39,9 @@ var ( // store config limitation statusStoreRejectLeader = plan.NewStatus(plan.StatusStoreRejectLeader) - statusStoreNotMatchRule = plan.NewStatus(plan.StatusStoreNotMatchRule) - statusStoreNotMatchIsolation = plan.NewStatus(plan.StatusStoreNotMatchIsolation) + statusStoreNotMatchRule = plan.NewStatus(plan.StatusStoreNotMatchRule) + statusStoreNotMatchIsolation = plan.NewStatus(plan.StatusStoreNotMatchIsolation) + statusStoreRecentlySplitRegions = plan.NewStatus(plan.StatusStoreRecentlySplitRegions) // region filter status statusRegionPendingPeer = plan.NewStatus(plan.StatusRegionUnhealthy) diff --git a/pkg/schedule/plan/status.go b/pkg/schedule/plan/status.go index 4242b631493..847d03a17ff 100644 --- a/pkg/schedule/plan/status.go +++ b/pkg/schedule/plan/status.go @@ -72,6 +72,8 @@ const ( StatusStoreLowSpace = iota + 500 // StatusStoreNotExisted represents the store cannot be found in PD. StatusStoreNotExisted + // StatusStoreRecentlySplitRegions represents the store cannot be selected due to the region is splitting. + StatusStoreRecentlySplitRegions ) // TODO: define region status priority @@ -127,7 +129,8 @@ var statusText = map[StatusCode]string{ StatusStoreDown: "StoreDown", StatusStoreBusy: "StoreBusy", - StatusStoreNotExisted: "StoreNotExisted", + StatusStoreNotExisted: "StoreNotExisted", + StatusStoreRecentlySplitRegions: "StoreRecentlySplitRegions", // region StatusRegionHot: "RegionHot", diff --git a/pkg/schedule/schedulers/balance_leader.go b/pkg/schedule/schedulers/balance_leader.go index e5516317f46..46f7fdc29cd 100644 --- a/pkg/schedule/schedulers/balance_leader.go +++ b/pkg/schedule/schedulers/balance_leader.go @@ -48,8 +48,6 @@ const ( // Default value is 4 which is subjected by scheduler-max-waiting-operator and leader-schedule-limit // If you want to increase balance speed more, please increase above-mentioned param. 
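As a usage sketch for the new region filter above: a scheduler that wants to skip regions whose leader sits on a recently-split store can combine NewStoreRecentlySplitFilter with SelectOneRegion. The helper name and its inputs below are placeholders; the filter and selector are the functions added or kept in region_filters.go in this patch.

// Hypothetical helper; cluster and candidates come from the calling scheduler.
func pickNonSplitRegion(cluster sche.SchedulerCluster, candidates []*core.RegionInfo) *core.RegionInfo {
	// Regions whose leader store has split regions within splitStoreWait are rejected.
	recentSplit := filter.NewStoreRecentlySplitFilter(cluster.GetStores())
	return filter.SelectOneRegion(candidates, nil /* no plan collector */, recentSplit)
}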
BalanceLeaderBatchSize = 4 - // MaxBalanceLeaderBatchSize is maximum of balance leader batch size - MaxBalanceLeaderBatchSize = 10 transferIn = "transfer-in" transferOut = "transfer-out" @@ -150,7 +148,7 @@ func (handler *balanceLeaderHandler) UpdateConfig(w http.ResponseWriter, r *http handler.rd.JSON(w, httpCode, v) } -func (handler *balanceLeaderHandler) ListConfig(w http.ResponseWriter, r *http.Request) { +func (handler *balanceLeaderHandler) ListConfig(w http.ResponseWriter, _ *http.Request) { conf := handler.config.Clone() handler.rd.JSON(w, http.StatusOK, conf) } @@ -162,6 +160,7 @@ type balanceLeaderScheduler struct { conf *balanceLeaderSchedulerConfig handler http.Handler filters []filter.Filter + regionFilters filter.RegionFilter filterCounter *filter.Counter } @@ -181,7 +180,7 @@ func newBalanceLeaderScheduler(opController *operator.Controller, conf *balanceL option(s) } s.filters = []filter.Filter{ - &filter.StoreStateFilter{ActionScope: s.GetName(), TransferLeader: true, OperatorLevel: constant.High}, + &filter.StoreStateFilter{ActionScope: s.GetName(), TransferLeader: true, ForbidRecentlySplitRegions: true, OperatorLevel: constant.High}, filter.NewSpecialUseFilter(s.GetName()), } return s @@ -277,7 +276,7 @@ func (cs *candidateStores) less(iID uint64, scorei float64, jID uint64, scorej f return scorei > scorej } -// hasStore returns returns true when there are leftover stores. +// hasStore returns true when there are leftover stores. func (cs *candidateStores) hasStore() bool { return cs.index < len(cs.stores) } @@ -349,6 +348,7 @@ func (l *balanceLeaderScheduler) Schedule(cluster sche.SchedulerCluster, dryRun opInfluence := l.OpController.GetOpInfluence(cluster.GetBasicCluster()) kind := constant.NewScheduleKind(constant.LeaderKind, leaderSchedulePolicy) solver := newSolver(basePlan, kind, cluster, opInfluence) + l.regionFilters = filter.NewStoreRecentlySplitFilter(cluster.GetStores()) stores := cluster.GetStores() scoreFunc := func(store *core.StoreInfo) float64 { @@ -486,7 +486,7 @@ func (l *balanceLeaderScheduler) transferLeaderOut(solver *solver, collector *pl // the worst follower peer and transfers the leader. 
func (l *balanceLeaderScheduler) transferLeaderIn(solver *solver, collector *plan.Collector) *operator.Operator { solver.Region = filter.SelectOneRegion(solver.RandFollowerRegions(solver.TargetStoreID(), l.conf.Ranges), - nil, filter.NewRegionPendingFilter(), filter.NewRegionDownFilter()) + nil, filter.NewRegionPendingFilter(), filter.NewRegionDownFilter(), l.regionFilters) if solver.Region == nil { log.Debug("store has no follower", zap.String("scheduler", l.GetName()), zap.Uint64("store-id", solver.TargetStoreID())) balanceLeaderNoFollowerRegionCounter.Inc() @@ -508,6 +508,7 @@ func (l *balanceLeaderScheduler) transferLeaderIn(solver *solver, collector *pla balanceLeaderNoLeaderRegionCounter.Inc() return nil } + finalFilters := l.filters conf := solver.GetSchedulerConfig() if leaderFilter := filter.NewPlacementLeaderSafeguard(l.GetName(), conf, solver.GetBasicCluster(), solver.GetRuleManager(), solver.Region, solver.Source, false /*allowMoveLeader*/); leaderFilter != nil { diff --git a/pkg/schedule/schedulers/balance_test.go b/pkg/schedule/schedulers/balance_test.go index 54fe8ff489b..3231716c681 100644 --- a/pkg/schedule/schedulers/balance_test.go +++ b/pkg/schedule/schedulers/balance_test.go @@ -20,6 +20,7 @@ import ( "math/rand" "sort" "testing" + "time" "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" @@ -294,6 +295,13 @@ func (suite *balanceLeaderSchedulerTestSuite) TestBalanceLimit() { // Region1: F F F L suite.tc.UpdateLeaderCount(4, 16) suite.NotEmpty(suite.schedule()) + + // can't balance leader from 4 to 1 when store 1 has split in it. + store := suite.tc.GetStore(4) + store = store.Clone(core.SetRecentlySplitRegionsTime(time.Now())) + suite.tc.PutStore(store) + op := suite.schedule() + suite.Empty(op) } func (suite *balanceLeaderSchedulerTestSuite) TestBalanceLeaderSchedulePolicy() { diff --git a/pkg/syncer/client.go b/pkg/syncer/client.go index ac409f90115..b0892a6736a 100644 --- a/pkg/syncer/client.go +++ b/pkg/syncer/client.go @@ -194,7 +194,7 @@ func (s *RegionSyncer) StartSyncWithLeader(addr string) { log.Debug("region is stale", zap.Stringer("origin", origin.GetMeta()), errs.ZapError(err)) continue } - _, saveKV, _, _ := regionGuide(region, origin) + changed := regionGuide(region, origin) overlaps := bc.PutRegion(region) if hasBuckets { @@ -202,7 +202,7 @@ func (s *RegionSyncer) StartSyncWithLeader(addr string) { region.UpdateBuckets(buckets[i], old) } } - if saveKV { + if changed.SaveKV { err = regionStorage.SaveRegion(r) } if err == nil { diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index dbd640d6e8c..771fb03ac20 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -1113,12 +1113,16 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { cluster.HandleStatsAsync(c, region) } - hasRegionStats := c.regionStats != nil - // Save to storage if meta is updated, except for flashback. // Save to cache if meta or leader is updated, or contains any down/pending peer. // Mark isNew if the region in cache does not have leader. - isNew, saveKV, saveCache, needSync := regionGuide(region, origin) - if !c.isAPIServiceMode && !saveKV && !saveCache && !isNew { + changed := regionGuide(region, origin) + return c.SaveRegion(region, changed) +} + +// SaveRegion saves region info into cache and PD storage. 
+func (c *RaftCluster) SaveRegion(region *core.RegionInfo, changed *core.RegionChanged) (err error) { + hasRegionStats := c.regionStats != nil + if !c.isAPIServiceMode && !changed.SaveKV && !changed.SaveCache && !changed.IsNew { // Due to some config changes need to update the region stats as well, // so we do some extra checks here. if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { @@ -1132,14 +1136,15 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { }) var overlaps []*core.RegionInfo - if saveCache { + + if changed.SaveCache { failpoint.Inject("decEpoch", func() { region = region.Clone(core.SetRegionConfVer(2), core.SetRegionVersion(2)) }) // To prevent a concurrent heartbeat of another region from overriding the up-to-date region info by a stale one, // check its validation again here. // - // However it can't solve the race condition of concurrent heartbeats from the same region. + // However, it can't solve the race condition of concurrent heartbeats from the same region. if overlaps, err = c.core.AtomicCheckAndPutRegion(region); err != nil { return err } @@ -1150,7 +1155,7 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { } if !c.isAPIServiceMode { - cluster.Collect(c, region, c.GetRegionStores(region), hasRegionStats, isNew, c.IsPrepared()) + cluster.Collect(c, region, c.GetRegionStores(region), hasRegionStats, changed.IsNew, c.IsPrepared()) } if c.storage != nil { @@ -1166,7 +1171,7 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { errs.ZapError(err)) } } - if saveKV { + if changed.SaveKV { if err := c.storage.SaveRegion(region.GetMeta()); err != nil { log.Error("failed to save region to storage", zap.Uint64("region-id", region.GetID()), @@ -1177,13 +1182,12 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { } } - if saveKV || needSync { + if changed.SaveKV || changed.NeedSync { select { case c.changedRegions <- region: default: } } - return nil } diff --git a/server/cluster/cluster_worker.go b/server/cluster/cluster_worker.go index c1da97363b5..3036fe95b3e 100644 --- a/server/cluster/cluster_worker.go +++ b/server/cluster/cluster_worker.go @@ -16,6 +16,8 @@ package cluster import ( "bytes" + "fmt" + "time" "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" @@ -26,11 +28,13 @@ import ( "github.com/tikv/pd/pkg/schedule/operator" "github.com/tikv/pd/pkg/statistics/buckets" "github.com/tikv/pd/pkg/utils/logutil" - "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/pkg/versioninfo" "go.uber.org/zap" ) +// store doesn't pick balance leader source if the split region is bigger than maxSplitThreshold. +const maxSplitThreshold = 10 + // HandleRegionHeartbeat processes RegionInfo reports from client. func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error { if err := c.processRegionHeartbeat(region); err != nil { @@ -41,6 +45,58 @@ func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error { return nil } +// ProcessRegionSplit to process split region into region cache. +// it's different with the region heartbeat, it's only fill some new region into the region cache. +// so it doesn't consider the leader and hot statistics. 
+func (c *RaftCluster) ProcessRegionSplit(regions []*metapb.Region) []error { + if err := c.checkSplitRegions(regions); err != nil { + return []error{err} + } + total := len(regions) - 1 + regions[0], regions[total] = regions[total], regions[0] + leaderStoreID := uint64(0) + if r := c.core.GetRegion(regions[0].GetId()); r != nil { + leaderStoreID = r.GetLeader().GetStoreId() + } + if leaderStoreID == 0 { + return []error{errors.New("origin region no leader")} + } + leaderStore := c.GetStore(leaderStoreID) + if leaderStore == nil { + return []error{errors.New("leader store not found")} + } + errList := make([]error, 0, total) + for _, region := range regions { + if len(region.GetPeers()) == 0 { + errList = append(errList, errors.New(fmt.Sprintf("region:%d has no peer", region.GetId()))) + continue + } + // region split initiator store will be leader with a high probability + leader := region.Peers[0] + if leaderStoreID > 0 { + for _, peer := range region.GetPeers() { + if peer.GetStoreId() == leaderStoreID { + leader = peer + break + } + } + } + region := core.NewRegionInfo(region, leader) + changed := &core.RegionChanged{ + IsNew: true, SaveKV: true, SaveCache: true, NeedSync: true, + } + if err := c.SaveRegion(region, changed); err != nil { + errList = append(errList, err) + } + } + // If the number of regions exceeds the threshold, update the last split time. + if len(regions) >= maxSplitThreshold { + newStore := leaderStore.Clone(core.SetRecentlySplitRegionsTime(time.Now())) + c.core.PutStore(newStore) + } + return errList +} + // HandleAskSplit handles the split request. func (c *RaftCluster) HandleAskSplit(request *pdpb.AskSplitRequest) (*pdpb.AskSplitResponse, error) { if c.isSchedulingHalted() { @@ -165,22 +221,6 @@ func (c *RaftCluster) HandleAskBatchSplit(request *pdpb.AskBatchSplitRequest) (* return resp, nil } -func (c *RaftCluster) checkSplitRegion(left *metapb.Region, right *metapb.Region) error { - if left == nil || right == nil { - return errors.New("invalid split region") - } - - if !bytes.Equal(left.GetEndKey(), right.GetStartKey()) { - return errors.New("invalid split region") - } - - if len(right.GetEndKey()) == 0 || bytes.Compare(left.GetStartKey(), right.GetEndKey()) < 0 { - return nil - } - - return errors.New("invalid split region") -} - func (c *RaftCluster) checkSplitRegions(regions []*metapb.Region) error { if len(regions) <= 1 { return errors.New("invalid split region") @@ -204,21 +244,18 @@ func (c *RaftCluster) HandleReportSplit(request *pdpb.ReportSplitRequest) (*pdpb left := request.GetLeft() right := request.GetRight() - err := c.checkSplitRegion(left, right) - if err != nil { + if errs := c.ProcessRegionSplit([]*metapb.Region{left, right}); len(errs) > 0 { log.Warn("report split region is invalid", logutil.ZapRedactStringer("left-region", core.RegionToHexMeta(left)), logutil.ZapRedactStringer("right-region", core.RegionToHexMeta(right)), - errs.ZapError(err)) - return nil, err + zap.Errors("errs", errs), + ) + // error[0] may be checker error, others are ignored. + return nil, errs[0] } - // Build origin region by using left and right. 
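Taken together with the store changes earlier in this patch, ProcessRegionSplit is also what arms the balance-leader guard: when a report contains at least maxSplitThreshold regions, the leader store is stamped via SetRecentlySplitRegionsTime, and HasRecentlySplitRegions stays true until the splitStoreWait window (defined in pkg/core, not shown in these hunks) elapses. A rough consumer-side sketch with a hypothetical helper name:

// Hypothetical helper showing how the split mark is read back by scheduling code.
func shouldSkipAsLeaderSource(c *RaftCluster, storeID uint64) bool {
	store := c.GetStore(storeID)
	// True right after a large batch split; StoreStateFilter{ForbidRecentlySplitRegions: true}
	// and StoreRecentlySplitFilter rely on exactly this check.
	return store != nil && store.HasRecentlySplitRegions()
}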
- originRegion := typeutil.DeepClone(right, core.RegionFactory) - originRegion.RegionEpoch = nil - originRegion.StartKey = left.GetStartKey() log.Info("region split, generate new region", - zap.Uint64("region-id", originRegion.GetId()), + zap.Uint64("region-id", right.GetId()), logutil.ZapRedactStringer("region-meta", core.RegionToHexMeta(left))) return &pdpb.ReportSplitResponse{}, nil } @@ -226,21 +263,19 @@ func (c *RaftCluster) HandleReportSplit(request *pdpb.ReportSplitRequest) (*pdpb // HandleBatchReportSplit handles the batch report split request. func (c *RaftCluster) HandleBatchReportSplit(request *pdpb.ReportBatchSplitRequest) (*pdpb.ReportBatchSplitResponse, error) { regions := request.GetRegions() - hrm := core.RegionsToHexMeta(regions) - err := c.checkSplitRegions(regions) - if err != nil { + if errs := c.ProcessRegionSplit(regions); len(errs) > 0 { log.Warn("report batch split region is invalid", zap.Stringer("region-meta", hrm), - errs.ZapError(err)) - return nil, err + zap.Errors("errs", errs)) + // error[0] may be checker error, others are ignored. + return nil, errs[0] } last := len(regions) - 1 - originRegion := typeutil.DeepClone(regions[last], core.RegionFactory) - hrm = core.RegionsToHexMeta(regions[:last]) + originRegionID := regions[last].GetId() log.Info("region batch split, generate new regions", - zap.Uint64("region-id", originRegion.GetId()), - zap.Stringer("origin", hrm), + zap.Uint64("region-id", originRegionID), + zap.Stringer("new-peer", hrm[:last]), zap.Int("total", last)) return &pdpb.ReportBatchSplitResponse{}, nil } diff --git a/server/cluster/cluster_worker_test.go b/server/cluster/cluster_worker_test.go index b376b38edc3..98b9b8380f1 100644 --- a/server/cluster/cluster_worker_test.go +++ b/server/cluster/cluster_worker_test.go @@ -23,9 +23,23 @@ import ( "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/mock/mockid" + "github.com/tikv/pd/pkg/schedule" "github.com/tikv/pd/pkg/storage" ) +func mockRegionPeer(cluster *RaftCluster, voters []uint64) []*metapb.Peer { + rst := make([]*metapb.Peer, len(voters)) + for i, v := range voters { + id, _ := cluster.AllocID() + rst[i] = &metapb.Peer{ + Id: id, + StoreId: v, + Role: metapb.PeerRole_Voter, + } + } + return rst +} + func TestReportSplit(t *testing.T) { re := require.New(t) ctx, cancel := context.WithCancel(context.Background()) @@ -34,12 +48,56 @@ func TestReportSplit(t *testing.T) { _, opt, err := newTestScheduleConfig() re.NoError(err) cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend(), core.NewBasicCluster()) - left := &metapb.Region{Id: 1, StartKey: []byte("a"), EndKey: []byte("b")} - right := &metapb.Region{Id: 2, StartKey: []byte("b"), EndKey: []byte("c")} - _, err = cluster.HandleReportSplit(&pdpb.ReportSplitRequest{Left: left, Right: right}) - re.NoError(err) + cluster.coordinator = schedule.NewCoordinator(cluster.ctx, cluster, nil) + right := &metapb.Region{Id: 1, StartKey: []byte("a"), EndKey: []byte("c"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), + RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 1}} + region := core.NewRegionInfo(right, right.Peers[0]) + cluster.putRegion(region) + store := newTestStores(1, "2.0.0") + cluster.core.PutStore(store[0]) + + // split failed, split region keys must be continuous. 
+ left := &metapb.Region{Id: 2, StartKey: []byte("a"), EndKey: []byte("b"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), + RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}} _, err = cluster.HandleReportSplit(&pdpb.ReportSplitRequest{Left: right, Right: left}) re.Error(err) + + // split success with continuous region keys. + right = &metapb.Region{Id: 1, StartKey: []byte("b"), EndKey: []byte("c"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), + RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}} + _, err = cluster.HandleReportSplit(&pdpb.ReportSplitRequest{Left: left, Right: right}) + re.NoError(err) + // no range hole + storeID := region.GetLeader().GetStoreId() + re.Equal(storeID, cluster.GetRegionByKey([]byte("b")).GetLeader().GetStoreId()) + re.Equal(storeID, cluster.GetRegionByKey([]byte("a")).GetLeader().GetStoreId()) + re.Equal(uint64(1), cluster.GetRegionByKey([]byte("b")).GetID()) + re.Equal(uint64(2), cluster.GetRegionByKey([]byte("a")).GetID()) + + testdata := []struct { + regionID uint64 + startKey []byte + endKey []byte + }{ + { + regionID: 1, + startKey: []byte("b"), + endKey: []byte("c"), + }, { + regionID: 2, + startKey: []byte("a"), + endKey: []byte("b"), + }, + } + + for _, data := range testdata { + r := metapb.Region{} + ok, err := cluster.storage.LoadRegion(data.regionID, &r) + re.NoError(err) + re.True(ok) + re.Equal(data.startKey, r.GetStartKey()) + re.Equal(data.endKey, r.GetEndKey()) + } } func TestReportBatchSplit(t *testing.T) { @@ -50,12 +108,39 @@ func TestReportBatchSplit(t *testing.T) { _, opt, err := newTestScheduleConfig() re.NoError(err) cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend(), core.NewBasicCluster()) + cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) + store := newTestStores(1, "2.0.0") + cluster.core.PutStore(store[0]) + re.False(cluster.GetStore(1).HasRecentlySplitRegions()) regions := []*metapb.Region{ - {Id: 1, StartKey: []byte(""), EndKey: []byte("a")}, - {Id: 2, StartKey: []byte("a"), EndKey: []byte("b")}, - {Id: 3, StartKey: []byte("b"), EndKey: []byte("c")}, - {Id: 3, StartKey: []byte("c"), EndKey: []byte("")}, + {Id: 1, StartKey: []byte(""), EndKey: []byte("a"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3})}, + {Id: 2, StartKey: []byte("a"), EndKey: []byte("b"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3})}, + {Id: 3, StartKey: []byte("b"), EndKey: []byte("c"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3})}, + {Id: 4, StartKey: []byte("c"), EndKey: []byte(""), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3})}, + } + _, err = cluster.HandleBatchReportSplit(&pdpb.ReportBatchSplitRequest{Regions: regions}) + re.Error(err) + + meta := &metapb.Region{Id: 1, StartKey: []byte(""), EndKey: []byte(""), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), + RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 1}} + region := core.NewRegionInfo(meta, meta.Peers[0]) + cluster.putRegion(region) + + regions = []*metapb.Region{ + {Id: 2, StartKey: []byte(""), EndKey: []byte("a"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}}, + {Id: 3, StartKey: []byte("a"), EndKey: []byte("b"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}}, + {Id: 4, StartKey: []byte("b"), EndKey: []byte("c"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}}, + {Id: 5, StartKey: []byte("c"), 
EndKey: []byte("d"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}}, + {Id: 6, StartKey: []byte("d"), EndKey: []byte("e"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}}, + {Id: 7, StartKey: []byte("e"), EndKey: []byte("f"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}}, + {Id: 8, StartKey: []byte("f"), EndKey: []byte("g"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}}, + {Id: 9, StartKey: []byte("g"), EndKey: []byte("h"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}}, + {Id: 10, StartKey: []byte("h"), EndKey: []byte("i"), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}}, + + {Id: 1, StartKey: []byte("i"), EndKey: []byte(""), Peers: mockRegionPeer(cluster, []uint64{1, 2, 3}), RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 2}}, } _, err = cluster.HandleBatchReportSplit(&pdpb.ReportBatchSplitRequest{Regions: regions}) re.NoError(err) + + re.True(cluster.GetStore(1).HasRecentlySplitRegions()) } diff --git a/server/grpc_service.go b/server/grpc_service.go index 5e40bc1c732..d218c2bb0b6 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -1428,10 +1428,24 @@ func (s *GrpcServer) GetRegion(ctx context.Context, request *pdpb.GetRegionReque if rc == nil { return &pdpb.GetRegionResponse{Header: s.notBootstrappedHeader()}, nil } - region := rc.GetRegionByKey(request.GetRegionKey()) + var region *core.RegionInfo + // allow region miss temporarily if this key can't be found in the region tree. +retryLoop: + for retry := 0; retry <= 10; retry++ { + region = rc.GetRegionByKey(request.GetRegionKey()) + if region != nil { + break retryLoop + } + select { + case <-ctx.Done(): + break retryLoop + case <-time.After(10 * time.Millisecond): + } + } if region == nil { return &pdpb.GetRegionResponse{Header: s.header()}, nil } + var buckets *metapb.Buckets if rc.GetStoreConfig().IsEnableRegionBucket() && request.GetNeedBuckets() { buckets = region.GetBuckets() @@ -1473,7 +1487,21 @@ func (s *GrpcServer) GetPrevRegion(ctx context.Context, request *pdpb.GetRegionR return &pdpb.GetRegionResponse{Header: s.notBootstrappedHeader()}, nil } - region := rc.GetPrevRegionByKey(request.GetRegionKey()) + var region *core.RegionInfo + // allow region miss temporarily if this key can't be found in the region tree. 
+retryLoop: + for retry := 0; retry <= 10; retry++ { + region = rc.GetPrevRegionByKey(request.GetRegionKey()) + if region != nil { + break retryLoop + } + select { + case <-ctx.Done(): + break retryLoop + case <-time.After(10 * time.Millisecond): + } + } + if region == nil { return &pdpb.GetRegionResponse{Header: s.header()}, nil } From 62ff67afbdcc3ab6c22662f7a415090e907b562d Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Thu, 21 Sep 2023 11:03:13 +0800 Subject: [PATCH 09/14] mcs/resourcemanager: add metric for TiFlash RU consumption (#7115) close tikv/pd#7116 add metric for tiflash ru consumption Signed-off-by: guo-shaoge Co-authored-by: ShuNing Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../resourcemanager/server/grpc_service.go | 7 ++++++- pkg/mcs/resourcemanager/server/manager.go | 21 ++++++++++++------- pkg/mcs/resourcemanager/server/metrics.go | 2 ++ 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pkg/mcs/resourcemanager/server/grpc_service.go b/pkg/mcs/resourcemanager/server/grpc_service.go index 5c1b5f0e458..d0fac920f2f 100644 --- a/pkg/mcs/resourcemanager/server/grpc_service.go +++ b/pkg/mcs/resourcemanager/server/grpc_service.go @@ -191,11 +191,16 @@ func (s *Service) AcquireTokenBuckets(stream rmpb.ResourceManager_AcquireTokenBu } // Send the consumption to update the metrics. isBackground := req.GetIsBackground() + isTiFlash := req.GetIsTiflash() + if isBackground && isTiFlash { + return errors.New("background and tiflash cannot be true at the same time") + } s.manager.consumptionDispatcher <- struct { resourceGroupName string *rmpb.Consumption isBackground bool - }{resourceGroupName, req.GetConsumptionSinceLastRequest(), isBackground} + isTiFlash bool + }{resourceGroupName, req.GetConsumptionSinceLastRequest(), isBackground, isTiFlash} if isBackground { continue } diff --git a/pkg/mcs/resourcemanager/server/manager.go b/pkg/mcs/resourcemanager/server/manager.go index 21866ee1156..df237bd0feb 100644 --- a/pkg/mcs/resourcemanager/server/manager.go +++ b/pkg/mcs/resourcemanager/server/manager.go @@ -60,6 +60,7 @@ type Manager struct { resourceGroupName string *rmpb.Consumption isBackground bool + isTiFlash bool } // record update time of each resource group consumptionRecord map[string]time.Time @@ -81,6 +82,7 @@ func NewManager[T ConfigProvider](srv bs.Server) *Manager { resourceGroupName string *rmpb.Consumption isBackground bool + isTiFlash bool }, defaultConsumptionChanSize), consumptionRecord: make(map[string]time.Time), } @@ -361,20 +363,23 @@ func (m *Manager) backgroundMetricsFlush(ctx context.Context) { if consumption == nil { continue } - backgroundType := "" + ruLabelType := tidbTypeLabel if consumptionInfo.isBackground { - backgroundType = backgroundTypeLabel + ruLabelType = backgroundTypeLabel + } + if consumptionInfo.isTiFlash { + ruLabelType = tiflashTypeLabel } var ( name = consumptionInfo.resourceGroupName - rruMetrics = readRequestUnitCost.WithLabelValues(name, backgroundType) - wruMetrics = writeRequestUnitCost.WithLabelValues(name, backgroundType) + rruMetrics = readRequestUnitCost.WithLabelValues(name, ruLabelType) + wruMetrics = writeRequestUnitCost.WithLabelValues(name, ruLabelType) sqlLayerRuMetrics = sqlLayerRequestUnitCost.WithLabelValues(name) - readByteMetrics = readByteCost.WithLabelValues(name, backgroundType) - writeByteMetrics = writeByteCost.WithLabelValues(name, backgroundType) - kvCPUMetrics = kvCPUCost.WithLabelValues(name, backgroundType) - sqlCPUMetrics = 
sqlCPUCost.WithLabelValues(name, backgroundType) + readByteMetrics = readByteCost.WithLabelValues(name, ruLabelType) + writeByteMetrics = writeByteCost.WithLabelValues(name, ruLabelType) + kvCPUMetrics = kvCPUCost.WithLabelValues(name, ruLabelType) + sqlCPUMetrics = sqlCPUCost.WithLabelValues(name, ruLabelType) readRequestCountMetrics = requestCount.WithLabelValues(name, readTypeLabel) writeRequestCountMetrics = requestCount.WithLabelValues(name, writeTypeLabel) ) diff --git a/pkg/mcs/resourcemanager/server/metrics.go b/pkg/mcs/resourcemanager/server/metrics.go index 083c44894ef..184eddc8ef9 100644 --- a/pkg/mcs/resourcemanager/server/metrics.go +++ b/pkg/mcs/resourcemanager/server/metrics.go @@ -26,6 +26,8 @@ const ( readTypeLabel = "read" writeTypeLabel = "write" backgroundTypeLabel = "background" + tiflashTypeLabel = "tiflash" + tidbTypeLabel = "tidb" ) var ( From 5b3d0172b0db653a97257373fac9ce9d9699c6f5 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Thu, 21 Sep 2023 14:24:13 +0800 Subject: [PATCH 10/14] *: fix sync isolation level to default placement rule (#7122) close tikv/pd#7121 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/mcs/scheduling/server/cluster.go | 2 +- pkg/mock/mockcluster/mockcluster.go | 2 +- pkg/schedule/checker/rule_checker_test.go | 35 ++++++++++++++++++ pkg/schedule/placement/rule_manager.go | 5 ++- pkg/schedule/placement/rule_manager_test.go | 6 ++-- pkg/statistics/region_collection_test.go | 4 +-- server/api/operator_test.go | 4 ++- server/cluster/cluster.go | 2 +- server/cluster/cluster_test.go | 10 +++--- server/config/persist_options.go | 7 ++++ server/server.go | 11 +++--- tests/pdctl/config/config_test.go | 40 ++++++++++++++++++--- 12 files changed, 104 insertions(+), 24 deletions(-) diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index 81c82d73d33..b2986f722df 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -75,7 +75,7 @@ func NewCluster(parentCtx context.Context, persistConfig *config.PersistConfig, checkMembershipCh: checkMembershipCh, } c.coordinator = schedule.NewCoordinator(ctx, c, hbStreams) - err = c.ruleManager.Initialize(persistConfig.GetMaxReplicas(), persistConfig.GetLocationLabels()) + err = c.ruleManager.Initialize(persistConfig.GetMaxReplicas(), persistConfig.GetLocationLabels(), persistConfig.GetIsolationLevel()) if err != nil { cancel() return nil, err diff --git a/pkg/mock/mockcluster/mockcluster.go b/pkg/mock/mockcluster/mockcluster.go index 1ed7ab4eb9f..01282b40534 100644 --- a/pkg/mock/mockcluster/mockcluster.go +++ b/pkg/mock/mockcluster/mockcluster.go @@ -213,7 +213,7 @@ func (mc *Cluster) AllocPeer(storeID uint64) (*metapb.Peer, error) { func (mc *Cluster) initRuleManager() { if mc.RuleManager == nil { mc.RuleManager = placement.NewRuleManager(mc.GetStorage(), mc, mc.GetSharedConfig()) - mc.RuleManager.Initialize(int(mc.GetReplicationConfig().MaxReplicas), mc.GetReplicationConfig().LocationLabels) + mc.RuleManager.Initialize(int(mc.GetReplicationConfig().MaxReplicas), mc.GetReplicationConfig().LocationLabels, mc.GetReplicationConfig().IsolationLevel) } } diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go index cbd7624f3b1..ad140e91606 100644 --- a/pkg/schedule/checker/rule_checker_test.go +++ b/pkg/schedule/checker/rule_checker_test.go @@ -112,6 +112,41 @@ func (suite *ruleCheckerTestSuite) TestAddRulePeerWithIsolationLevel() { 
suite.Equal(uint64(4), op.Step(0).(operator.AddLearner).ToStore) } +func (suite *ruleCheckerTestSuite) TestReplaceDownPeerWithIsolationLevel() { + suite.cluster.SetMaxStoreDownTime(100 * time.Millisecond) + suite.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1", "host": "h1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1", "host": "h2"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z2", "host": "h3"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z2", "host": "h4"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z3", "host": "h5"}) + suite.cluster.AddLabelsStore(6, 1, map[string]string{"zone": "z3", "host": "h6"}) + suite.cluster.AddLeaderRegionWithRange(1, "", "", 1, 3, 5) + suite.ruleManager.DeleteRule("pd", "default") + suite.ruleManager.SetRule(&placement.Rule{ + GroupID: "pd", + ID: "test", + Index: 100, + Override: true, + Role: placement.Voter, + Count: 3, + LocationLabels: []string{"zone", "host"}, + IsolationLevel: "zone", + }) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) + region := suite.cluster.GetRegion(1) + downPeer := []*pdpb.PeerStats{ + {Peer: region.GetStorePeer(5), DownSeconds: 6000}, + } + region = region.Clone(core.WithDownPeers(downPeer)) + suite.cluster.PutRegion(region) + suite.cluster.SetStoreDown(5) + suite.cluster.SetStoreDown(6) + time.Sleep(200 * time.Millisecond) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) +} + func (suite *ruleCheckerTestSuite) TestFixPeer() { suite.cluster.AddLeaderStore(1, 1) suite.cluster.AddLeaderStore(2, 1) diff --git a/pkg/schedule/placement/rule_manager.go b/pkg/schedule/placement/rule_manager.go index 3bd272a00ac..909c0fa1078 100644 --- a/pkg/schedule/placement/rule_manager.go +++ b/pkg/schedule/placement/rule_manager.go @@ -66,7 +66,7 @@ func NewRuleManager(storage endpoint.RuleStorage, storeSetInformer core.StoreSet // Initialize loads rules from storage. If Placement Rules feature is never enabled, it creates default rule that is // compatible with previous configuration. 
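For context on the Initialize signature change just below: the isolation level is now threaded into the default rule(s) seeded at startup. Assuming a call like Initialize(3, []string{"zone", "host"}, "zone") with witness disabled, the seeded default rule would be roughly the following illustrative expected value, not an excerpt from the patch:

var expectedDefaultRule = &placement.Rule{
	GroupID:        "pd",
	ID:             "default",
	Role:           placement.Voter,
	Count:          3,
	LocationLabels: []string{"zone", "host"},
	IsolationLevel: "zone", // newly propagated from the replication config
}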
-func (m *RuleManager) Initialize(maxReplica int, locationLabels []string) error { +func (m *RuleManager) Initialize(maxReplica int, locationLabels []string, isolationLevel string) error { m.Lock() defer m.Unlock() if m.initialized { @@ -93,6 +93,7 @@ func (m *RuleManager) Initialize(maxReplica int, locationLabels []string) error Role: Voter, Count: maxReplica - witnessCount, LocationLabels: locationLabels, + IsolationLevel: isolationLevel, }, { GroupID: "pd", @@ -101,6 +102,7 @@ func (m *RuleManager) Initialize(maxReplica int, locationLabels []string) error Count: witnessCount, IsWitness: true, LocationLabels: locationLabels, + IsolationLevel: isolationLevel, }, }..., ) @@ -111,6 +113,7 @@ func (m *RuleManager) Initialize(maxReplica int, locationLabels []string) error Role: Voter, Count: maxReplica, LocationLabels: locationLabels, + IsolationLevel: isolationLevel, }) } for _, defaultRule := range defaultRules { diff --git a/pkg/schedule/placement/rule_manager_test.go b/pkg/schedule/placement/rule_manager_test.go index e5be8d74cd2..a6454337aa8 100644 --- a/pkg/schedule/placement/rule_manager_test.go +++ b/pkg/schedule/placement/rule_manager_test.go @@ -34,7 +34,7 @@ func newTestManager(t *testing.T, enableWitness bool) (endpoint.RuleStorage, *Ru var err error manager := NewRuleManager(store, nil, mockconfig.NewTestOptions()) manager.conf.SetEnableWitness(enableWitness) - err = manager.Initialize(3, []string{"zone", "rack", "host"}) + err = manager.Initialize(3, []string{"zone", "rack", "host"}, "") re.NoError(err) return store, manager } @@ -157,7 +157,7 @@ func TestSaveLoad(t *testing.T) { } m2 := NewRuleManager(store, nil, nil) - err := m2.Initialize(3, []string{"no", "labels"}) + err := m2.Initialize(3, []string{"no", "labels"}, "") re.NoError(err) re.Len(m2.GetAllRules(), 3) re.Equal(rules[0].String(), m2.GetRule("pd", "default").String()) @@ -173,7 +173,7 @@ func TestSetAfterGet(t *testing.T) { manager.SetRule(rule) m2 := NewRuleManager(store, nil, nil) - err := m2.Initialize(100, []string{}) + err := m2.Initialize(100, []string{}, "") re.NoError(err) rule = m2.GetRule("pd", "default") re.Equal(1, rule.Count) diff --git a/pkg/statistics/region_collection_test.go b/pkg/statistics/region_collection_test.go index 232fb8b73d8..2706ffeb043 100644 --- a/pkg/statistics/region_collection_test.go +++ b/pkg/statistics/region_collection_test.go @@ -30,7 +30,7 @@ func TestRegionStatistics(t *testing.T) { re := require.New(t) store := storage.NewStorageWithMemoryBackend() manager := placement.NewRuleManager(store, nil, nil) - err := manager.Initialize(3, []string{"zone", "rack", "host"}) + err := manager.Initialize(3, []string{"zone", "rack", "host"}, "") re.NoError(err) opt := mockconfig.NewTestOptions() opt.SetPlacementRuleEnabled(false) @@ -120,7 +120,7 @@ func TestRegionStatisticsWithPlacementRule(t *testing.T) { re := require.New(t) store := storage.NewStorageWithMemoryBackend() manager := placement.NewRuleManager(store, nil, nil) - err := manager.Initialize(3, []string{"zone", "rack", "host"}) + err := manager.Initialize(3, []string{"zone", "rack", "host"}, "") re.NoError(err) opt := mockconfig.NewTestOptions() opt.SetPlacementRuleEnabled(true) diff --git a/server/api/operator_test.go b/server/api/operator_test.go index ddb605c7d87..ee849552f09 100644 --- a/server/api/operator_test.go +++ b/server/api/operator_test.go @@ -383,7 +383,9 @@ func (suite *transferRegionOperatorTestSuite) TestTransferRegionWithPlacementRul if testCase.placementRuleEnable { err := 
suite.svr.GetRaftCluster().GetRuleManager().Initialize( suite.svr.GetRaftCluster().GetOpts().GetMaxReplicas(), - suite.svr.GetRaftCluster().GetOpts().GetLocationLabels()) + suite.svr.GetRaftCluster().GetOpts().GetLocationLabels(), + suite.svr.GetRaftCluster().GetOpts().GetIsolationLevel(), + ) suite.NoError(err) } if len(testCase.rules) > 0 { diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 771fb03ac20..d42dbb21ed1 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -301,7 +301,7 @@ func (c *RaftCluster) Start(s Server) error { c.ruleManager = placement.NewRuleManager(c.storage, c, c.GetOpts()) if c.opt.IsPlacementRulesEnabled() { - err = c.ruleManager.Initialize(c.opt.GetMaxReplicas(), c.opt.GetLocationLabels()) + err = c.ruleManager.Initialize(c.opt.GetMaxReplicas(), c.opt.GetLocationLabels(), c.opt.GetIsolationLevel()) if err != nil { return err } diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index c9d4d0f8f61..aa826e34406 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -243,7 +243,7 @@ func TestSetOfflineStore(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) cluster.ruleManager = placement.NewRuleManager(storage.NewStorageWithMemoryBackend(), cluster, cluster.GetOpts()) if opt.IsPlacementRulesEnabled() { - err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels()) + err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels(), opt.GetIsolationLevel()) if err != nil { panic(err) } @@ -440,7 +440,7 @@ func TestUpStore(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) cluster.ruleManager = placement.NewRuleManager(storage.NewStorageWithMemoryBackend(), cluster, cluster.GetOpts()) if opt.IsPlacementRulesEnabled() { - err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels()) + err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels(), opt.GetIsolationLevel()) if err != nil { panic(err) } @@ -543,7 +543,7 @@ func TestDeleteStoreUpdatesClusterVersion(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) cluster.ruleManager = placement.NewRuleManager(storage.NewStorageWithMemoryBackend(), cluster, cluster.GetOpts()) if opt.IsPlacementRulesEnabled() { - err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels()) + err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels(), opt.GetIsolationLevel()) if err != nil { panic(err) } @@ -1270,7 +1270,7 @@ func TestOfflineAndMerge(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) cluster.ruleManager = placement.NewRuleManager(storage.NewStorageWithMemoryBackend(), cluster, cluster.GetOpts()) if opt.IsPlacementRulesEnabled() { - err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels()) + err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels(), opt.GetIsolationLevel()) if err != nil { panic(err) } @@ -2129,7 +2129,7 @@ func newTestRaftCluster( rc.InitCluster(id, opt, s, basicCluster, nil) rc.ruleManager = placement.NewRuleManager(storage.NewStorageWithMemoryBackend(), rc, opt) if opt.IsPlacementRulesEnabled() { - err := rc.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels()) + err := rc.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels(), opt.GetIsolationLevel()) if err != nil { panic(err) 
} diff --git a/server/config/persist_options.go b/server/config/persist_options.go index 3f1c4d4a24e..14fdbf653aa 100644 --- a/server/config/persist_options.go +++ b/server/config/persist_options.go @@ -330,6 +330,13 @@ func (o *PersistOptions) SetEnableWitness(enable bool) { o.SetScheduleConfig(v) } +// SetMaxStoreDownTime to set the max store down time. It's only used to test. +func (o *PersistOptions) SetMaxStoreDownTime(time time.Duration) { + v := o.GetScheduleConfig().Clone() + v.MaxStoreDownTime = typeutil.NewDuration(time) + o.SetScheduleConfig(v) +} + // SetMaxMergeRegionSize sets the max merge region size. func (o *PersistOptions) SetMaxMergeRegionSize(maxMergeRegionSize uint64) { v := o.GetScheduleConfig().Clone() diff --git a/server/server.go b/server/server.go index 03e036a968e..2fb66387d7a 100644 --- a/server/server.go +++ b/server/server.go @@ -1030,7 +1030,7 @@ func (s *Server) SetReplicationConfig(cfg sc.ReplicationConfig) error { } if cfg.EnablePlacementRules { // initialize rule manager. - if err := rc.GetRuleManager().Initialize(int(cfg.MaxReplicas), cfg.LocationLabels); err != nil { + if err := rc.GetRuleManager().Initialize(int(cfg.MaxReplicas), cfg.LocationLabels, cfg.IsolationLevel); err != nil { return err } } else { @@ -1053,19 +1053,19 @@ func (s *Server) SetReplicationConfig(cfg sc.ReplicationConfig) error { defaultRule := rc.GetRuleManager().GetRule("pd", "default") CheckInDefaultRule := func() error { - // replication config won't work when placement rule is enabled and exceeds one default rule + // replication config won't work when placement rule is enabled and exceeds one default rule if !(defaultRule != nil && len(defaultRule.StartKey) == 0 && len(defaultRule.EndKey) == 0) { - return errors.New("cannot update MaxReplicas or LocationLabels when placement rules feature is enabled and not only default rule exists, please update rule instead") + return errors.New("cannot update MaxReplicas, LocationLabels or IsolationLevel when placement rules feature is enabled and not only default rule exists, please update rule instead") } - if !(defaultRule.Count == int(old.MaxReplicas) && typeutil.AreStringSlicesEqual(defaultRule.LocationLabels, []string(old.LocationLabels))) { + if !(defaultRule.Count == int(old.MaxReplicas) && typeutil.AreStringSlicesEqual(defaultRule.LocationLabels, []string(old.LocationLabels)) && defaultRule.IsolationLevel == old.IsolationLevel) { return errors.New("cannot to update replication config, the default rules do not consistent with replication config, please update rule instead") } return nil } - if !(cfg.MaxReplicas == old.MaxReplicas && typeutil.AreStringSlicesEqual(cfg.LocationLabels, old.LocationLabels)) { + if !(cfg.MaxReplicas == old.MaxReplicas && typeutil.AreStringSlicesEqual(cfg.LocationLabels, old.LocationLabels) && cfg.IsolationLevel == old.IsolationLevel) { if err := CheckInDefaultRule(); err != nil { return err } @@ -1076,6 +1076,7 @@ func (s *Server) SetReplicationConfig(cfg sc.ReplicationConfig) error { if rule != nil { rule.Count = int(cfg.MaxReplicas) rule.LocationLabels = cfg.LocationLabels + rule.IsolationLevel = cfg.IsolationLevel rc := s.GetRaftCluster() if rc == nil { return errs.ErrNotBootstrapped.GenWithStackByArgs() diff --git a/tests/pdctl/config/config_test.go b/tests/pdctl/config/config_test.go index 3d0146589d5..f43a964b50c 100644 --- a/tests/pdctl/config/config_test.go +++ b/tests/pdctl/config/config_test.go @@ -683,7 +683,7 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.Equal(expect, 
replicationCfg.MaxReplicas) } - checkLocaltionLabels := func(expect int) { + checkLocationLabels := func(expect int) { args := []string{"-u", pdAddr, "config", "show", "replication"} output, err := pdctl.ExecuteCommand(cmd, args...) re.NoError(err) @@ -692,6 +692,15 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.Len(replicationCfg.LocationLabels, expect) } + checkIsolationLevel := func(expect string) { + args := []string{"-u", pdAddr, "config", "show", "replication"} + output, err := pdctl.ExecuteCommand(cmd, args...) + re.NoError(err) + replicationCfg := sc.ReplicationConfig{} + re.NoError(json.Unmarshal(output, &replicationCfg)) + re.Equal(replicationCfg.IsolationLevel, expect) + } + checkRuleCount := func(expect int) { args := []string{"-u", pdAddr, "config", "placement-rules", "show", "--group", "pd", "--id", "default"} output, err := pdctl.ExecuteCommand(cmd, args...) @@ -710,6 +719,15 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.Len(rule.LocationLabels, expect) } + checkRuleIsolationLevel := func(expect string) { + args := []string{"-u", pdAddr, "config", "placement-rules", "show", "--group", "pd", "--id", "default"} + output, err := pdctl.ExecuteCommand(cmd, args...) + re.NoError(err) + rule := placement.Rule{} + re.NoError(json.Unmarshal(output, &rule)) + re.Equal(rule.IsolationLevel, expect) + } + // update successfully when placement rules is not enabled. output, err := pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "set", "max-replicas", "2") re.NoError(err) @@ -718,8 +736,13 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "set", "location-labels", "zone,host") re.NoError(err) re.Contains(string(output), "Success!") - checkLocaltionLabels(2) + output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "set", "isolation-level", "zone") + re.NoError(err) + re.Contains(string(output), "Success!") + checkLocationLabels(2) checkRuleLocationLabels(2) + checkIsolationLevel("zone") + checkRuleIsolationLevel("zone") // update successfully when only one default rule exists. output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "placement-rules", "enable") @@ -732,11 +755,18 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { checkMaxReplicas(3) checkRuleCount(3) + // We need to change isolation first because we will validate + // if the location label contains the isolation level when setting location labels. + output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "set", "isolation-level", "host") + re.NoError(err) + re.Contains(string(output), "Success!") output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "set", "location-labels", "host") re.NoError(err) re.Contains(string(output), "Success!") - checkLocaltionLabels(1) + checkLocationLabels(1) checkRuleLocationLabels(1) + checkIsolationLevel("host") + checkRuleIsolationLevel("host") // update unsuccessfully when many rule exists. 
fname := t.TempDir() @@ -760,8 +790,10 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.NoError(err) checkMaxReplicas(4) checkRuleCount(4) - checkLocaltionLabels(1) + checkLocationLabels(1) checkRuleLocationLabels(1) + checkIsolationLevel("host") + checkRuleIsolationLevel("host") } func TestPDServerConfig(t *testing.T) { From e2f12696c76adc96d43297bcc2f5df097ed21b70 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Thu, 21 Sep 2023 16:55:46 +0800 Subject: [PATCH 11/14] util: add check delete json function (#7113) ref tikv/pd#4399 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- errors.toml | 30 ++++++++++++------------- pkg/autoscaling/prometheus_test.go | 4 ++-- pkg/errs/errno.go | 6 ++--- pkg/tso/keyspace_group_manager.go | 14 +++++++----- pkg/utils/apiutil/apiutil.go | 11 +++------ pkg/utils/testutil/api_check.go | 11 ++++++++- server/api/admin_test.go | 7 ++---- server/api/diagnostic_test.go | 3 +-- server/api/operator_test.go | 9 ++++---- server/api/region_label_test.go | 3 +-- server/api/rule_test.go | 12 +++++----- server/api/scheduler.go | 7 +++--- server/api/scheduler_test.go | 27 +++++++++------------- server/api/service_gc_safepoint_test.go | 4 +--- 14 files changed, 71 insertions(+), 77 deletions(-) diff --git a/errors.toml b/errors.toml index 6766da79572..1b96de8a209 100644 --- a/errors.toml +++ b/errors.toml @@ -531,21 +531,6 @@ error = ''' plugin is not found: %s ''' -["PD:operator:ErrRegionAbnormalPeer"] -error = ''' -region %v has abnormal peer -''' - -["PD:operator:ErrRegionNotAdjacent"] -error = ''' -two regions are not adjacent -''' - -["PD:operator:ErrRegionNotFound"] -error = ''' -region %v not found -''' - ["PD:os:ErrOSOpen"] error = ''' open error @@ -616,6 +601,21 @@ error = ''' failed to unmarshal proto ''' +["PD:region:ErrRegionAbnormalPeer"] +error = ''' +region %v has abnormal peer +''' + +["PD:region:ErrRegionNotAdjacent"] +error = ''' +two regions are not adjacent +''' + +["PD:region:ErrRegionNotFound"] +error = ''' +region %v not found +''' + ["PD:region:ErrRegionRuleContent"] error = ''' invalid region rule content, %s diff --git a/pkg/autoscaling/prometheus_test.go b/pkg/autoscaling/prometheus_test.go index 6d4a27b0411..6c30e3ead4c 100644 --- a/pkg/autoscaling/prometheus_test.go +++ b/pkg/autoscaling/prometheus_test.go @@ -155,7 +155,7 @@ func makeJSONResponse(promResp *response) (*http.Response, []byte, error) { response := &http.Response{ Status: "200 OK", - StatusCode: 200, + StatusCode: http.StatusOK, Proto: "HTTP/1.1", ProtoMajor: 1, ProtoMinor: 1, @@ -246,7 +246,7 @@ func (c *errorHTTPStatusClient) Do(_ context.Context, req *http.Request) (r *htt r, body, err = makeJSONResponse(promResp) - r.StatusCode = 500 + r.StatusCode = http.StatusInternalServerError r.Status = "500 Internal Server Error" return diff --git a/pkg/errs/errno.go b/pkg/errs/errno.go index 9eedb144f95..181dfc9b393 100644 --- a/pkg/errs/errno.go +++ b/pkg/errs/errno.go @@ -103,11 +103,11 @@ var ( // region errors var ( // ErrRegionNotAdjacent is error info for region not adjacent. - ErrRegionNotAdjacent = errors.Normalize("two regions are not adjacent", errors.RFCCodeText("PD:operator:ErrRegionNotAdjacent")) + ErrRegionNotAdjacent = errors.Normalize("two regions are not adjacent", errors.RFCCodeText("PD:region:ErrRegionNotAdjacent")) // ErrRegionNotFound is error info for region not found. 
- ErrRegionNotFound = errors.Normalize("region %v not found", errors.RFCCodeText("PD:operator:ErrRegionNotFound")) + ErrRegionNotFound = errors.Normalize("region %v not found", errors.RFCCodeText("PD:region:ErrRegionNotFound")) // ErrRegionAbnormalPeer is error info for region has abnormal peer. - ErrRegionAbnormalPeer = errors.Normalize("region %v has abnormal peer", errors.RFCCodeText("PD:operator:ErrRegionAbnormalPeer")) + ErrRegionAbnormalPeer = errors.Normalize("region %v has abnormal peer", errors.RFCCodeText("PD:region:ErrRegionAbnormalPeer")) ) // plugin errors diff --git a/pkg/tso/keyspace_group_manager.go b/pkg/tso/keyspace_group_manager.go index c6d2323aa4b..3b352884eab 100644 --- a/pkg/tso/keyspace_group_manager.go +++ b/pkg/tso/keyspace_group_manager.go @@ -1226,16 +1226,17 @@ func (kgm *KeyspaceGroupManager) finishSplitKeyspaceGroup(id uint32) error { return nil } startRequest := time.Now() - statusCode, err := apiutil.DoDelete( + resp, err := apiutil.DoDelete( kgm.httpClient, kgm.cfg.GeBackendEndpoints()+keyspaceGroupsAPIPrefix+fmt.Sprintf("/%d/split", id)) if err != nil { return err } - if statusCode != http.StatusOK { + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { log.Warn("failed to finish split keyspace group", zap.Uint32("keyspace-group-id", id), - zap.Int("status-code", statusCode)) + zap.Int("status-code", resp.StatusCode)) return errs.ErrSendRequest.FastGenByArgs() } kgm.metrics.finishSplitSendDuration.Observe(time.Since(startRequest).Seconds()) @@ -1264,16 +1265,17 @@ func (kgm *KeyspaceGroupManager) finishMergeKeyspaceGroup(id uint32) error { return nil } startRequest := time.Now() - statusCode, err := apiutil.DoDelete( + resp, err := apiutil.DoDelete( kgm.httpClient, kgm.cfg.GeBackendEndpoints()+keyspaceGroupsAPIPrefix+fmt.Sprintf("/%d/merge", id)) if err != nil { return err } - if statusCode != http.StatusOK { + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { log.Warn("failed to finish merging keyspace group", zap.Uint32("keyspace-group-id", id), - zap.Int("status-code", statusCode)) + zap.Int("status-code", resp.StatusCode)) return errs.ErrSendRequest.FastGenByArgs() } kgm.metrics.finishMergeSendDuration.Observe(time.Since(startRequest).Seconds()) diff --git a/pkg/utils/apiutil/apiutil.go b/pkg/utils/apiutil/apiutil.go index 0b72b9af10f..2c476042da0 100644 --- a/pkg/utils/apiutil/apiutil.go +++ b/pkg/utils/apiutil/apiutil.go @@ -226,17 +226,12 @@ func PostJSONIgnoreResp(client *http.Client, url string, data []byte) error { } // DoDelete is used to send delete request and return http response code. -func DoDelete(client *http.Client, url string) (int, error) { +func DoDelete(client *http.Client, url string) (*http.Response, error) { req, err := http.NewRequest(http.MethodDelete, url, nil) if err != nil { - return http.StatusBadRequest, err - } - res, err := client.Do(req) - if err != nil { - return 0, err + return nil, err } - defer res.Body.Close() - return res.StatusCode, nil + return client.Do(req) } func checkResponse(resp *http.Response, err error) error { diff --git a/pkg/utils/testutil/api_check.go b/pkg/utils/testutil/api_check.go index d11d575967d..84af97f828d 100644 --- a/pkg/utils/testutil/api_check.go +++ b/pkg/utils/testutil/api_check.go @@ -123,9 +123,18 @@ func CheckPatchJSON(client *http.Client, url string, data []byte, checkOpts ...f return checkResp(resp, checkOpts...) } +// CheckDelete is used to do delete request and do check options. 
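A quick note ahead of the CheckDelete helper that follows: DoDelete now hands the raw *http.Response back to the caller, so every call site owns the body and must close it. A minimal caller sketch (the wrapper name and url are placeholders; the error value mirrors the keyspace group manager call sites above):

// Hypothetical wrapper showing the new DoDelete contract.
func deleteAndCheck(client *http.Client, url string) error {
	resp, err := apiutil.DoDelete(client, url)
	if err != nil {
		// resp may be nil when err != nil, so don't read resp.StatusCode here.
		return err
	}
	defer resp.Body.Close() // the caller owns the body now; closing avoids leaking connections
	if resp.StatusCode != http.StatusOK {
		return errs.ErrSendRequest.FastGenByArgs()
	}
	return nil
}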
+func CheckDelete(client *http.Client, url string, checkOpts ...func([]byte, int, http.Header)) error { + resp, err := apiutil.DoDelete(client, url) + if err != nil { + return err + } + return checkResp(resp, checkOpts...) +} + func checkResp(resp *http.Response, checkOpts ...func([]byte, int, http.Header)) error { res, err := io.ReadAll(resp.Body) - resp.Body.Close() + defer resp.Body.Close() if err != nil { return err } diff --git a/server/api/admin_test.go b/server/api/admin_test.go index 1f2b386eb98..6a972171e1f 100644 --- a/server/api/admin_test.go +++ b/server/api/admin_test.go @@ -26,7 +26,6 @@ import ( "github.com/pingcap/kvproto/pkg/pdpb" "github.com/stretchr/testify/suite" "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/pkg/utils/apiutil" tu "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/server" ) @@ -271,9 +270,8 @@ func (suite *adminTestSuite) TestMarkSnapshotRecovering() { suite.NoError(err2) suite.True(resp.Marked) // unmark - code, err := apiutil.DoDelete(testDialClient, url) + err := tu.CheckDelete(testDialClient, url, tu.StatusOK(re)) suite.NoError(err) - suite.Equal(200, code) suite.NoError(tu.CheckGetJSON(testDialClient, url, nil, tu.StatusOK(re), tu.StringContain(re, "false"))) } @@ -310,9 +308,8 @@ func (suite *adminTestSuite) TestRecoverAllocID() { suite.NoError(err2) suite.Equal(id, uint64(99000001)) // unmark - code, err := apiutil.DoDelete(testDialClient, markRecoveringURL) + err := tu.CheckDelete(testDialClient, markRecoveringURL, tu.StatusOK(re)) suite.NoError(err) - suite.Equal(200, code) suite.NoError(tu.CheckGetJSON(testDialClient, markRecoveringURL, nil, tu.StatusOK(re), tu.StringContain(re, "false"))) suite.NoError(tu.CheckPostJSON(testDialClient, url, []byte(`{"id": "100000"}`), diff --git a/server/api/diagnostic_test.go b/server/api/diagnostic_test.go index 8a39b2e0007..1774c221539 100644 --- a/server/api/diagnostic_test.go +++ b/server/api/diagnostic_test.go @@ -24,7 +24,6 @@ import ( "github.com/stretchr/testify/suite" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/schedulers" - "github.com/tikv/pd/pkg/utils/apiutil" tu "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/server" "github.com/tikv/pd/server/config" @@ -129,7 +128,7 @@ func (suite *diagnosticTestSuite) TestSchedulerDiagnosticAPI() { suite.checkStatus("normal", balanceRegionURL) deleteURL := fmt.Sprintf("%s/%s", suite.schedulerPrifex, schedulers.BalanceRegionName) - _, err = apiutil.DoDelete(testDialClient, deleteURL) + err = tu.CheckDelete(testDialClient, deleteURL, tu.StatusOK(re)) suite.NoError(err) suite.checkStatus("disabled", balanceRegionURL) } diff --git a/server/api/operator_test.go b/server/api/operator_test.go index ee849552f09..1675fdd40c7 100644 --- a/server/api/operator_test.go +++ b/server/api/operator_test.go @@ -33,7 +33,6 @@ import ( "github.com/tikv/pd/pkg/mock/mockhbstream" pdoperator "github.com/tikv/pd/pkg/schedule/operator" "github.com/tikv/pd/pkg/schedule/placement" - "github.com/tikv/pd/pkg/utils/apiutil" tu "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/pkg/versioninfo" "github.com/tikv/pd/server" @@ -99,7 +98,7 @@ func (suite *operatorTestSuite) TestAddRemovePeer() { suite.Contains(operator, "add learner peer 1 on store 3") suite.Contains(operator, "RUNNING") - _, err = apiutil.DoDelete(testDialClient, regionURL) + err = tu.CheckDelete(testDialClient, regionURL, tu.StatusOK(re)) suite.NoError(err) records = mustReadURL(re, recordURL) suite.Contains(records, "admin-add-peer {add peer: store [3]}") @@ -110,7 
+109,7 @@ func (suite *operatorTestSuite) TestAddRemovePeer() { suite.Contains(operator, "RUNNING") suite.Contains(operator, "remove peer on store 2") - _, err = apiutil.DoDelete(testDialClient, regionURL) + err = tu.CheckDelete(testDialClient, regionURL, tu.StatusOK(re)) suite.NoError(err) records = mustReadURL(re, recordURL) suite.Contains(records, "admin-remove-peer {rm peer: store [2]}") @@ -406,8 +405,10 @@ func (suite *transferRegionOperatorTestSuite) TestTransferRegionWithPlacementRul if len(testCase.expectSteps) > 0 { operator = mustReadURL(re, regionURL) suite.Contains(operator, testCase.expectSteps) + err = tu.CheckDelete(testDialClient, regionURL, tu.StatusOK(re)) + } else { + err = tu.CheckDelete(testDialClient, regionURL, tu.StatusNotOK(re)) } - _, err = apiutil.DoDelete(testDialClient, regionURL) suite.NoError(err) } } diff --git a/server/api/region_label_test.go b/server/api/region_label_test.go index 021ec7f1359..fd7401b83e0 100644 --- a/server/api/region_label_test.go +++ b/server/api/region_label_test.go @@ -24,7 +24,6 @@ import ( "github.com/pingcap/failpoint" "github.com/stretchr/testify/suite" "github.com/tikv/pd/pkg/schedule/labeler" - "github.com/tikv/pd/pkg/utils/apiutil" tu "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/server" ) @@ -86,7 +85,7 @@ func (suite *regionLabelTestSuite) TestGetSet() { expects := []*labeler.LabelRule{rules[0], rules[2]} suite.Equal(expects, resp) - _, err = apiutil.DoDelete(testDialClient, suite.urlPrefix+"rule/"+url.QueryEscape("rule2/a/b")) + err = tu.CheckDelete(testDialClient, suite.urlPrefix+"rule/"+url.QueryEscape("rule2/a/b"), tu.StatusOK(re)) suite.NoError(err) err = tu.ReadGetJSON(re, testDialClient, suite.urlPrefix+"rules", &resp) suite.NoError(err) diff --git a/server/api/rule_test.go b/server/api/rule_test.go index 4cea1523401..d2dc50f1119 100644 --- a/server/api/rule_test.go +++ b/server/api/rule_test.go @@ -26,7 +26,6 @@ import ( "github.com/stretchr/testify/suite" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" - "github.com/tikv/pd/pkg/utils/apiutil" tu "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/server" "github.com/tikv/pd/server/config" @@ -202,13 +201,13 @@ func (suite *ruleTestSuite) TestGet() { name: "found", rule: rule, found: true, - code: 200, + code: http.StatusOK, }, { name: "not found", rule: placement.Rule{GroupID: "a", ID: "30", StartKeyHex: "1111", EndKeyHex: "3333", Role: "voter", Count: 1}, found: false, - code: 404, + code: http.StatusNotFound, }, } for _, testCase := range testCases { @@ -533,9 +532,8 @@ func (suite *ruleTestSuite) TestDelete() { url := fmt.Sprintf("%s/rule/%s/%s", suite.urlPrefix, testCase.groupID, testCase.id) // clear suspect keyRanges to prevent test case from others suite.svr.GetRaftCluster().ClearSuspectKeyRanges() - statusCode, err := apiutil.DoDelete(testDialClient, url) + err = tu.CheckDelete(testDialClient, url, tu.StatusOK(suite.Require())) suite.NoError(err) - suite.Equal(http.StatusOK, statusCode) if len(testCase.popKeyRange) > 0 { popKeyRangeMap := map[string]struct{}{} for i := 0; i < len(testCase.popKeyRange)/2; i++ { @@ -726,7 +724,7 @@ func (suite *ruleTestSuite) TestBundle() { suite.compareBundle(bundles[1], b2) // Delete - _, err = apiutil.DoDelete(testDialClient, suite.urlPrefix+"/placement-rule/pd") + err = tu.CheckDelete(testDialClient, suite.urlPrefix+"/placement-rule/pd", tu.StatusOK(suite.Require())) suite.NoError(err) // GetAll again @@ -753,7 +751,7 @@ func (suite *ruleTestSuite) TestBundle() { 
suite.compareBundle(bundles[2], b3) // Delete using regexp - _, err = apiutil.DoDelete(testDialClient, suite.urlPrefix+"/placement-rule/"+url.PathEscape("foo.*")+"?regexp") + err = tu.CheckDelete(testDialClient, suite.urlPrefix+"/placement-rule/"+url.PathEscape("foo.*")+"?regexp", tu.StatusOK(suite.Require())) suite.NoError(err) // GetAll again diff --git a/server/api/scheduler.go b/server/api/scheduler.go index dc7f2507141..c2691ea9826 100644 --- a/server/api/scheduler.go +++ b/server/api/scheduler.go @@ -324,12 +324,13 @@ func (h *schedulerHandler) redirectSchedulerDelete(w http.ResponseWriter, name, h.r.JSON(w, http.StatusInternalServerError, err.Error()) return } - statusCode, err := apiutil.DoDelete(h.svr.GetHTTPClient(), deleteURL) + resp, err := apiutil.DoDelete(h.svr.GetHTTPClient(), deleteURL) if err != nil { - h.r.JSON(w, statusCode, err.Error()) + h.r.JSON(w, resp.StatusCode, err.Error()) return } - h.r.JSON(w, statusCode, nil) + defer resp.Body.Close() + h.r.JSON(w, resp.StatusCode, nil) } // FIXME: details of input json body params diff --git a/server/api/scheduler_test.go b/server/api/scheduler_test.go index 613de8e441c..b015bbe8f52 100644 --- a/server/api/scheduler_test.go +++ b/server/api/scheduler_test.go @@ -25,7 +25,6 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/stretchr/testify/suite" sc "github.com/tikv/pd/pkg/schedule/config" - "github.com/tikv/pd/pkg/utils/apiutil" tu "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/server" ) @@ -93,7 +92,7 @@ func (suite *scheduleTestSuite) TestOriginAPI() { suite.NoError(tu.ReadGetJSON(re, testDialClient, listURL, &resp)) suite.Len(resp["store-id-ranges"], 2) deleteURL := fmt.Sprintf("%s/%s", suite.urlPrefix, "evict-leader-scheduler-1") - _, err = apiutil.DoDelete(testDialClient, deleteURL) + err = tu.CheckDelete(testDialClient, deleteURL, tu.StatusOK(re)) suite.NoError(err) suite.Len(rc.GetSchedulers(), 1) resp1 := make(map[string]interface{}) @@ -101,18 +100,16 @@ func (suite *scheduleTestSuite) TestOriginAPI() { suite.Len(resp1["store-id-ranges"], 1) deleteURL = fmt.Sprintf("%s/%s", suite.urlPrefix, "evict-leader-scheduler-2") suite.NoError(failpoint.Enable("github.com/tikv/pd/server/config/persistFail", "return(true)")) - statusCode, err := apiutil.DoDelete(testDialClient, deleteURL) + err = tu.CheckDelete(testDialClient, deleteURL, tu.Status(re, http.StatusInternalServerError)) suite.NoError(err) - suite.Equal(500, statusCode) suite.Len(rc.GetSchedulers(), 1) suite.NoError(failpoint.Disable("github.com/tikv/pd/server/config/persistFail")) - statusCode, err = apiutil.DoDelete(testDialClient, deleteURL) + err = tu.CheckDelete(testDialClient, deleteURL, tu.StatusOK(re)) suite.NoError(err) - suite.Equal(200, statusCode) suite.Empty(rc.GetSchedulers()) - suite.NoError(tu.CheckGetJSON(testDialClient, listURL, nil, tu.Status(re, 404))) - statusCode, _ = apiutil.DoDelete(testDialClient, deleteURL) - suite.Equal(404, statusCode) + suite.NoError(tu.CheckGetJSON(testDialClient, listURL, nil, tu.Status(re, http.StatusNotFound))) + err = tu.CheckDelete(testDialClient, deleteURL, tu.Status(re, http.StatusNotFound)) + suite.NoError(err) } func (suite *scheduleTestSuite) TestAPI() { @@ -370,15 +367,14 @@ func (suite *scheduleTestSuite) TestAPI() { // using /pd/v1/schedule-config/grant-leader-scheduler/config to delete exists store from grant-leader-scheduler deleteURL := fmt.Sprintf("%s%s%s/%s/delete/%s", suite.svr.GetAddr(), apiPrefix, server.SchedulerConfigHandlerPath, name, "2") - _, err = 
apiutil.DoDelete(testDialClient, deleteURL) + err = tu.CheckDelete(testDialClient, deleteURL, tu.StatusOK(re)) suite.NoError(err) resp = make(map[string]interface{}) suite.NoError(tu.ReadGetJSON(re, testDialClient, listURL, &resp)) delete(exceptMap, "2") suite.Equal(exceptMap, resp["store-id-ranges"]) - statusCode, err := apiutil.DoDelete(testDialClient, deleteURL) + err = tu.CheckDelete(testDialClient, deleteURL, tu.Status(re, http.StatusNotFound)) suite.NoError(err) - suite.Equal(404, statusCode) }, }, { @@ -434,15 +430,14 @@ func (suite *scheduleTestSuite) TestAPI() { // using /pd/v1/schedule-config/evict-leader-scheduler/config to delete exist store from evict-leader-scheduler deleteURL := fmt.Sprintf("%s%s%s/%s/delete/%s", suite.svr.GetAddr(), apiPrefix, server.SchedulerConfigHandlerPath, name, "4") - _, err = apiutil.DoDelete(testDialClient, deleteURL) + err = tu.CheckDelete(testDialClient, deleteURL, tu.StatusOK(re)) suite.NoError(err) resp = make(map[string]interface{}) suite.NoError(tu.ReadGetJSON(re, testDialClient, listURL, &resp)) delete(exceptMap, "4") suite.Equal(exceptMap, resp["store-id-ranges"]) - statusCode, err := apiutil.DoDelete(testDialClient, deleteURL) + err = tu.CheckDelete(testDialClient, deleteURL, tu.Status(re, http.StatusNotFound)) suite.NoError(err) - suite.Equal(404, statusCode) }, }, } @@ -591,7 +586,7 @@ func (suite *scheduleTestSuite) addScheduler(body []byte) { func (suite *scheduleTestSuite) deleteScheduler(createdName string) { deleteURL := fmt.Sprintf("%s/%s", suite.urlPrefix, createdName) - _, err := apiutil.DoDelete(testDialClient, deleteURL) + err := tu.CheckDelete(testDialClient, deleteURL, tu.StatusOK(suite.Require())) suite.NoError(err) } diff --git a/server/api/service_gc_safepoint_test.go b/server/api/service_gc_safepoint_test.go index fe52204dfb2..517a94c2e23 100644 --- a/server/api/service_gc_safepoint_test.go +++ b/server/api/service_gc_safepoint_test.go @@ -16,7 +16,6 @@ package api import ( "fmt" - "net/http" "testing" "time" @@ -93,9 +92,8 @@ func (suite *serviceGCSafepointTestSuite) TestServiceGCSafepoint() { suite.NoError(err) suite.Equal(list, listResp) - statusCode, err := apiutil.DoDelete(testDialClient, sspURL+"/a") + err = testutil.CheckDelete(testDialClient, sspURL+"/a", testutil.StatusOK(suite.Require())) suite.NoError(err) - suite.Equal(http.StatusOK, statusCode) left, err := storage.LoadAllServiceGCSafePoints() suite.NoError(err) From 96ace89decdc0b5e0a050aa17ba4356057ec3b88 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Thu, 21 Sep 2023 17:12:15 +0800 Subject: [PATCH 12/14] tests: refactor and make pd-ctl helper support mcs (#7120) ref tikv/pd#5839 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- tests/autoscaling/autoscaling_test.go | 2 +- tests/cluster.go | 20 ++++++ tests/compatibility/version_upgrade_test.go | 6 +- tests/dashboard/service_test.go | 2 +- tests/integrations/client/client_test.go | 12 ++-- .../mcs/keyspace/tso_keyspace_group_test.go | 2 +- .../resourcemanager/resource_manager_test.go | 4 +- tests/integrations/mcs/scheduling/api_test.go | 2 +- .../mcs/tso/keyspace_group_manager_test.go | 4 +- tests/integrations/tso/client_test.go | 2 +- tests/pdctl/cluster/cluster_test.go | 2 +- tests/pdctl/config/config_test.go | 36 ++++------ tests/pdctl/health/health_test.go | 2 +- tests/pdctl/helper.go | 64 ----------------- tests/pdctl/hot/hot_test.go | 36 +++++----- tests/pdctl/keyspace/keyspace_group_test.go | 16 ++--- tests/pdctl/keyspace/keyspace_test.go | 2 
+- tests/pdctl/label/label_test.go | 4 +- tests/pdctl/log/log_test.go | 9 +-- tests/pdctl/member/member_test.go | 2 +- tests/pdctl/operator/operator_test.go | 10 +-- tests/pdctl/region/region_test.go | 18 ++--- tests/pdctl/scheduler/scheduler_test.go | 16 ++--- tests/pdctl/store/store_test.go | 14 ++-- tests/pdctl/unsafe/unsafe_operation_test.go | 2 +- tests/registry/registry_test.go | 4 +- tests/server/api/api_test.go | 61 ++++++++-------- tests/server/apiv2/handlers/keyspace_test.go | 2 +- .../apiv2/handlers/tso_keyspace_group_test.go | 2 +- tests/server/cluster/cluster_test.go | 42 +++++------ tests/server/cluster/cluster_work_test.go | 6 +- tests/server/config/config_test.go | 4 +- tests/server/id/id_test.go | 12 ++-- tests/server/keyspace/keyspace_test.go | 2 +- tests/server/member/member_test.go | 2 +- .../region_syncer/region_syncer_test.go | 10 +-- .../server/storage/hot_region_storage_test.go | 29 ++++---- tests/server/tso/consistency_test.go | 10 +-- tests/server/tso/global_tso_test.go | 4 +- tests/server/tso/tso_test.go | 4 +- tests/server/watch/leader_watch_test.go | 4 +- tests/testutil.go | 72 +++++++++++++++++++ 42 files changed, 289 insertions(+), 270 deletions(-) diff --git a/tests/autoscaling/autoscaling_test.go b/tests/autoscaling/autoscaling_test.go index 55e29297dbd..663bc92f562 100644 --- a/tests/autoscaling/autoscaling_test.go +++ b/tests/autoscaling/autoscaling_test.go @@ -42,7 +42,7 @@ func TestAPI(t *testing.T) { re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) var jsonStr = []byte(` diff --git a/tests/cluster.go b/tests/cluster.go index ce8293531cd..c49f3cd982d 100644 --- a/tests/cluster.go +++ b/tests/cluster.go @@ -33,6 +33,7 @@ import ( "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/id" "github.com/tikv/pd/pkg/keyspace" + scheduling "github.com/tikv/pd/pkg/mcs/scheduling/server" "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/swaggerserver" @@ -447,6 +448,7 @@ type TestCluster struct { sync.Mutex pool map[uint64]struct{} } + schedulingCluster *TestSchedulingCluster } // ConfigOption is used to define customize settings in test. @@ -629,6 +631,11 @@ func (c *TestCluster) GetFollower() string { return "" } +// GetLeaderServer returns the leader server of all servers +func (c *TestCluster) GetLeaderServer() *TestServer { + return c.GetServer(c.GetLeader()) +} + // WaitLeader is used to get leader. // If it exceeds the maximum number of loops, it will return an empty string. func (c *TestCluster) WaitLeader(ops ...WaitOption) string { @@ -853,6 +860,19 @@ func (c *TestCluster) CheckTSOUnique(ts uint64) bool { return true } +// GetSchedulingPrimaryServer returns the scheduling primary server. +func (c *TestCluster) GetSchedulingPrimaryServer() *scheduling.Server { + if c.schedulingCluster == nil { + return nil + } + return c.schedulingCluster.GetPrimaryServer() +} + +// SetSchedulingCluster sets the scheduling cluster. 
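SetSchedulingCluster, defined next, together with GetLeaderServer and GetSchedulingPrimaryServer above, is what the scheduling-aware tests later in this patch build on. A minimal sketch of the wiring, borrowing ctx, re, cluster, and the NewTestSchedulingCluster call that appears in the forwarding tests below; exactly where the suite attaches the scheduling cluster is not shown in this hunk:

    leaderServer := cluster.GetLeaderServer() // shorthand for cluster.GetServer(cluster.GetLeader())
    re.NoError(leaderServer.BootstrapCluster())

    tc, err := tests.NewTestSchedulingCluster(ctx, 2, leaderServer.GetAddr())
    re.NoError(err)
    cluster.SetSchedulingCluster(tc)
    // GetSchedulingPrimaryServer forwards to the attached cluster and returns nil when none is set.
    _ = cluster.GetSchedulingPrimaryServer()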
+func (c *TestCluster) SetSchedulingCluster(cluster *TestSchedulingCluster) { + c.schedulingCluster = cluster +} + // WaitOp represent the wait configuration type WaitOp struct { retryTimes int diff --git a/tests/compatibility/version_upgrade_test.go b/tests/compatibility/version_upgrade_test.go index 11573e6da2f..8979d85c9bb 100644 --- a/tests/compatibility/version_upgrade_test.go +++ b/tests/compatibility/version_upgrade_test.go @@ -38,7 +38,7 @@ func TestStoreRegister(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) putStoreRequest := &pdpb.PutStoreRequest{ @@ -63,7 +63,7 @@ func TestStoreRegister(t *testing.T) { re.NoError(err) cluster.WaitLeader() - leaderServer = cluster.GetServer(cluster.GetLeader()) + leaderServer = cluster.GetLeaderServer() re.NotNil(leaderServer) newVersion := leaderServer.GetClusterVersion() re.Equal(version, newVersion) @@ -92,7 +92,7 @@ func TestRollingUpgrade(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) stores := []*pdpb.PutStoreRequest{ diff --git a/tests/dashboard/service_test.go b/tests/dashboard/service_test.go index f75e047d8f1..ab3a2c431cb 100644 --- a/tests/dashboard/service_test.go +++ b/tests/dashboard/service_test.go @@ -134,7 +134,7 @@ func (suite *dashboardTestSuite) testDashboard(internalProxy bool) { cluster.WaitLeader() servers := cluster.GetServers() - leader := cluster.GetServer(cluster.GetLeader()) + leader := cluster.GetLeaderServer() leaderAddr := leader.GetAddr() // auto select node diff --git a/tests/integrations/client/client_test.go b/tests/integrations/client/client_test.go index d669e17af90..9cabbb03090 100644 --- a/tests/integrations/client/client_test.go +++ b/tests/integrations/client/client_test.go @@ -347,7 +347,7 @@ func TestUnavailableTimeAfterLeaderIsReady(t *testing.T) { go getTsoFunc() go func() { defer wg.Done() - leader := cluster.GetServer(cluster.GetLeader()) + leader := cluster.GetLeaderServer() leader.Stop() re.NotEmpty(cluster.WaitLeader()) leaderReadyTime = time.Now() @@ -362,7 +362,7 @@ func TestUnavailableTimeAfterLeaderIsReady(t *testing.T) { go getTsoFunc() go func() { defer wg.Done() - leader := cluster.GetServer(cluster.GetLeader()) + leader := cluster.GetLeaderServer() re.NoError(failpoint.Enable("github.com/tikv/pd/client/unreachableNetwork", "return(true)")) leader.Stop() re.NotEmpty(cluster.WaitLeader()) @@ -596,7 +596,7 @@ func TestGetTsoFromFollowerClient2(t *testing.T) { }) lastTS = checkTS(re, cli, lastTS) - re.NoError(cluster.GetServer(cluster.GetLeader()).ResignLeader()) + re.NoError(cluster.GetLeaderServer().ResignLeader()) re.NotEmpty(cluster.WaitLeader()) lastTS = checkTS(re, cli, lastTS) @@ -622,7 +622,7 @@ func runServer(re *require.Assertions, cluster *tests.TestCluster) []string { err := cluster.RunInitialServers() re.NoError(err) re.NotEmpty(cluster.WaitLeader()) - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) testServers := cluster.GetServers() @@ -1439,7 +1439,7 @@ func TestPutGet(t *testing.T) { getResp, err = client.Get(context.Background(), key) re.NoError(err) re.Equal([]byte("2"), getResp.GetKvs()[0].Value) - s := 
cluster.GetServer(cluster.GetLeader()) + s := cluster.GetLeaderServer() // use etcd client delete the key _, err = s.GetEtcdClient().Delete(context.Background(), string(key)) re.NoError(err) @@ -1459,7 +1459,7 @@ func TestClientWatchWithRevision(t *testing.T) { endpoints := runServer(re, cluster) client := setupCli(re, ctx, endpoints) defer client.Close() - s := cluster.GetServer(cluster.GetLeader()) + s := cluster.GetLeaderServer() watchPrefix := "watch_test" defer func() { _, err := s.GetEtcdClient().Delete(context.Background(), watchPrefix+"test") diff --git a/tests/integrations/mcs/keyspace/tso_keyspace_group_test.go b/tests/integrations/mcs/keyspace/tso_keyspace_group_test.go index 59aabb260ae..af7b31553b3 100644 --- a/tests/integrations/mcs/keyspace/tso_keyspace_group_test.go +++ b/tests/integrations/mcs/keyspace/tso_keyspace_group_test.go @@ -62,7 +62,7 @@ func (suite *keyspaceGroupTestSuite) SetupTest() { suite.NoError(err) suite.NoError(cluster.RunInitialServers()) suite.NotEmpty(cluster.WaitLeader()) - suite.server = cluster.GetServer(cluster.GetLeader()) + suite.server = cluster.GetLeaderServer() suite.NoError(suite.server.BootstrapCluster()) suite.backendEndpoints = suite.server.GetAddr() suite.dialClient = &http.Client{ diff --git a/tests/integrations/mcs/resourcemanager/resource_manager_test.go b/tests/integrations/mcs/resourcemanager/resource_manager_test.go index 546339bee0f..0be18d1bbd3 100644 --- a/tests/integrations/mcs/resourcemanager/resource_manager_test.go +++ b/tests/integrations/mcs/resourcemanager/resource_manager_test.go @@ -903,7 +903,7 @@ func (suite *resourceManagerClientTestSuite) TestBasicResourceGroupCURD() { // Test Resource Group CURD via HTTP finalNum = 1 getAddr := func(i int) string { - server := suite.cluster.GetServer(suite.cluster.GetLeader()) + server := suite.cluster.GetLeaderServer() if i%2 == 1 { server = suite.cluster.GetServer(suite.cluster.GetFollower()) } @@ -1298,7 +1298,7 @@ func (suite *resourceManagerClientTestSuite) TestResourceGroupControllerConfigCh } getAddr := func() string { - server := suite.cluster.GetServer(suite.cluster.GetLeader()) + server := suite.cluster.GetLeaderServer() if rand.Intn(100)%2 == 1 { server = suite.cluster.GetServer(suite.cluster.GetFollower()) } diff --git a/tests/integrations/mcs/scheduling/api_test.go b/tests/integrations/mcs/scheduling/api_test.go index 04671d84798..311c8a3fbed 100644 --- a/tests/integrations/mcs/scheduling/api_test.go +++ b/tests/integrations/mcs/scheduling/api_test.go @@ -45,7 +45,7 @@ func (suite *apiTestSuite) SetupSuite() { suite.NoError(err) suite.NoError(cluster.RunInitialServers()) suite.NotEmpty(cluster.WaitLeader()) - suite.server = cluster.GetServer(cluster.GetLeader()) + suite.server = cluster.GetLeaderServer() suite.NoError(suite.server.BootstrapCluster()) suite.backendEndpoints = suite.server.GetAddr() suite.dialClient = &http.Client{ diff --git a/tests/integrations/mcs/tso/keyspace_group_manager_test.go b/tests/integrations/mcs/tso/keyspace_group_manager_test.go index 3d3fe25b372..d1a4cf35db4 100644 --- a/tests/integrations/mcs/tso/keyspace_group_manager_test.go +++ b/tests/integrations/mcs/tso/keyspace_group_manager_test.go @@ -517,7 +517,7 @@ func TestTwiceSplitKeyspaceGroup(t *testing.T) { re.NoError(err) defer tc.Destroy() tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) tsoCluster, err := tests.NewTestTSOCluster(ctx, 2, pdAddr) @@ -711,7 +711,7 @@ func 
TestGetTSOImmediately(t *testing.T) { re.NoError(err) defer tc.Destroy() tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) tsoCluster, err := tests.NewTestTSOCluster(ctx, 2, pdAddr) diff --git a/tests/integrations/tso/client_test.go b/tests/integrations/tso/client_test.go index 1d2f437e849..63243214e81 100644 --- a/tests/integrations/tso/client_test.go +++ b/tests/integrations/tso/client_test.go @@ -389,7 +389,7 @@ func (suite *tsoClientTestSuite) TestRandomShutdown() { if !suite.legacy { suite.tsoCluster.WaitForDefaultPrimaryServing(re).Close() } else { - suite.cluster.GetServer(suite.cluster.GetLeader()).GetServer().Close() + suite.cluster.GetLeaderServer().GetServer().Close() } time.Sleep(time.Duration(n) * time.Second) } diff --git a/tests/pdctl/cluster/cluster_test.go b/tests/pdctl/cluster/cluster_test.go index 2b8b8bc8f59..cd4ec6e1391 100644 --- a/tests/pdctl/cluster/cluster_test.go +++ b/tests/pdctl/cluster/cluster_test.go @@ -39,7 +39,7 @@ func TestClusterAndPing(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) cluster.WaitLeader() - err = cluster.GetServer(cluster.GetLeader()).BootstrapCluster() + err = cluster.GetLeaderServer().BootstrapCluster() re.NoError(err) pdAddr := cluster.GetConfig().GetClientURL() i := strings.Index(pdAddr, "//") diff --git a/tests/pdctl/config/config_test.go b/tests/pdctl/config/config_test.go index f43a964b50c..6ed0841bf74 100644 --- a/tests/pdctl/config/config_test.go +++ b/tests/pdctl/config/config_test.go @@ -64,10 +64,10 @@ func TestConfig(t *testing.T) { Id: 1, State: metapb.StoreState_Up, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) svr := leaderServer.GetServer() - pdctl.MustPutStore(re, svr, store) + tests.MustPutStore(re, cluster, store) defer cluster.Destroy() // config show @@ -300,10 +300,9 @@ func TestPlacementRules(t *testing.T) { State: metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) - svr := leaderServer.GetServer() - pdctl.MustPutStore(re, svr, store) + tests.MustPutStore(re, cluster, store) defer cluster.Destroy() output, err := pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "placement-rules", "enable") @@ -358,7 +357,7 @@ func TestPlacementRules(t *testing.T) { re.Equal([2]string{"pd", "test1"}, rules2[1].Key()) // test rule region detail - pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b")) + tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b")) fit := &placement.RegionFit{} // need clear up args, so create new a cobra.Command. Otherwise gourp still exists. 
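The call-site change repeated through the pd-ctl tests in this commit is uniform: MustPutStore now takes the whole TestCluster instead of the leader's *server.Server, while MustPutRegion keeps its arguments and only moves from the tests/pdctl helper package into tests. In sketch form, with the store and region arguments as used in the surrounding tests:

    // before
    pdctl.MustPutStore(re, leaderServer.GetServer(), store)
    pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"))

    // after
    tests.MustPutStore(re, cluster, store)
    tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"))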
cmd2 := pdctlCmd.GetRootCmd() @@ -398,10 +397,9 @@ func TestPlacementRuleGroups(t *testing.T) { State: metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) - svr := leaderServer.GetServer() - pdctl.MustPutStore(re, svr, store) + tests.MustPutStore(re, cluster, store) defer cluster.Destroy() output, err := pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "placement-rules", "enable") @@ -473,10 +471,9 @@ func TestPlacementRuleBundle(t *testing.T) { State: metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) - svr := leaderServer.GetServer() - pdctl.MustPutStore(re, svr, store) + tests.MustPutStore(re, cluster, store) defer cluster.Destroy() output, err := pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "placement-rules", "enable") @@ -609,10 +606,9 @@ func TestReplicationMode(t *testing.T) { State: metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) - svr := leaderServer.GetServer() - pdctl.MustPutStore(re, svr, store) + tests.MustPutStore(re, cluster, store) defer cluster.Destroy() conf := config.ReplicationModeConfig{ @@ -668,10 +664,9 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { Id: 1, State: metapb.StoreState_Up, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) - svr := leaderServer.GetServer() - pdctl.MustPutStore(re, svr, store) + tests.MustPutStore(re, cluster, store) defer cluster.Destroy() checkMaxReplicas := func(expect uint64) { @@ -813,10 +808,9 @@ func TestPDServerConfig(t *testing.T) { State: metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) - svr := leaderServer.GetServer() - pdctl.MustPutStore(re, svr, store) + tests.MustPutStore(re, cluster, store) defer cluster.Destroy() output, err := pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "show", "server") diff --git a/tests/pdctl/health/health_test.go b/tests/pdctl/health/health_test.go index bc808a36750..748250babe4 100644 --- a/tests/pdctl/health/health_test.go +++ b/tests/pdctl/health/health_test.go @@ -36,7 +36,7 @@ func TestHealth(t *testing.T) { err = tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) pdAddr := tc.GetConfig().GetClientURL() cmd := pdctlCmd.GetRootCmd() diff --git a/tests/pdctl/helper.go b/tests/pdctl/helper.go index d7d6a858497..3912cdfef7c 100644 --- a/tests/pdctl/helper.go +++ b/tests/pdctl/helper.go @@ -16,21 +16,13 @@ package pdctl import ( "bytes" - "context" - "fmt" "sort" - "github.com/docker/go-units" - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/pingcap/kvproto/pkg/pdpb" "github.com/spf13/cobra" "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/typeutil" - "github.com/tikv/pd/pkg/versioninfo" - "github.com/tikv/pd/server" "github.com/tikv/pd/server/api" - "github.com/tikv/pd/tests" ) // 
ExecuteCommand is used for test purpose. @@ -89,59 +81,3 @@ func CheckRegionsInfo(re *require.Assertions, output *api.RegionsInfo, expected CheckRegionInfo(re, &got[i], region) } } - -// MustPutStore is used for test purpose. -func MustPutStore(re *require.Assertions, svr *server.Server, store *metapb.Store) { - store.Address = fmt.Sprintf("tikv%d", store.GetId()) - if len(store.Version) == 0 { - store.Version = versioninfo.MinSupportedVersion(versioninfo.Version2_0).String() - } - grpcServer := &server.GrpcServer{Server: svr} - _, err := grpcServer.PutStore(context.Background(), &pdpb.PutStoreRequest{ - Header: &pdpb.RequestHeader{ClusterId: svr.ClusterID()}, - Store: store, - }) - re.NoError(err) - - storeInfo := grpcServer.GetRaftCluster().GetStore(store.GetId()) - newStore := storeInfo.Clone(core.SetStoreStats(&pdpb.StoreStats{ - Capacity: uint64(10 * units.GiB), - UsedSize: uint64(9 * units.GiB), - Available: uint64(1 * units.GiB), - })) - grpcServer.GetRaftCluster().GetBasicCluster().PutStore(newStore) -} - -// MustPutRegion is used for test purpose. -func MustPutRegion(re *require.Assertions, cluster *tests.TestCluster, regionID, storeID uint64, start, end []byte, opts ...core.RegionCreateOption) *core.RegionInfo { - leader := &metapb.Peer{ - Id: regionID, - StoreId: storeID, - } - metaRegion := &metapb.Region{ - Id: regionID, - StartKey: start, - EndKey: end, - Peers: []*metapb.Peer{leader}, - RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 1}, - } - r := core.NewRegionInfo(metaRegion, leader, opts...) - err := cluster.HandleRegionHeartbeat(r) - re.NoError(err) - return r -} - -// MustReportBuckets is used for test purpose. -func MustReportBuckets(re *require.Assertions, cluster *tests.TestCluster, regionID uint64, start, end []byte, stats *metapb.BucketStats) *metapb.Buckets { - buckets := &metapb.Buckets{ - RegionId: regionID, - Version: 1, - Keys: [][]byte{start, end}, - Stats: stats, - // report buckets interval is 10s - PeriodInMs: 10000, - } - err := cluster.HandleReportBuckets(buckets) - re.NoError(err) - return buckets -} diff --git a/tests/pdctl/hot/hot_test.go b/tests/pdctl/hot/hot_test.go index 352b891c092..359d89199c9 100644 --- a/tests/pdctl/hot/hot_test.go +++ b/tests/pdctl/hot/hot_test.go @@ -63,10 +63,10 @@ func TestHot(t *testing.T) { Labels: []*metapb.StoreLabel{{Key: "engine", Value: "tiflash"}}, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) - pdctl.MustPutStore(re, leaderServer.GetServer(), store1) - pdctl.MustPutStore(re, leaderServer.GetServer(), store2) + tests.MustPutStore(re, cluster, store1) + tests.MustPutStore(re, cluster, store2) defer cluster.Destroy() // test hot store @@ -159,7 +159,7 @@ func TestHot(t *testing.T) { } testHot(hotRegionID, hotStoreID, "read") case "write": - pdctl.MustPutRegion( + tests.MustPutRegion( re, cluster, hotRegionID, hotStoreID, []byte("c"), []byte("d"), @@ -222,16 +222,16 @@ func TestHotWithStoreID(t *testing.T) { }, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) } defer cluster.Destroy() - pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), core.SetReportInterval(0, utils.RegionHeartBeatReportInterval)) - pdctl.MustPutRegion(re, cluster, 2, 2, 
[]byte("c"), []byte("d"), core.SetWrittenBytes(6000000000), core.SetReportInterval(0, utils.RegionHeartBeatReportInterval)) - pdctl.MustPutRegion(re, cluster, 3, 1, []byte("e"), []byte("f"), core.SetWrittenBytes(9000000000), core.SetReportInterval(0, utils.RegionHeartBeatReportInterval)) + tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), core.SetReportInterval(0, utils.RegionHeartBeatReportInterval)) + tests.MustPutRegion(re, cluster, 2, 2, []byte("c"), []byte("d"), core.SetWrittenBytes(6000000000), core.SetReportInterval(0, utils.RegionHeartBeatReportInterval)) + tests.MustPutRegion(re, cluster, 3, 1, []byte("e"), []byte("f"), core.SetWrittenBytes(9000000000), core.SetReportInterval(0, utils.RegionHeartBeatReportInterval)) // wait hot scheduler starts rc := leaderServer.GetRaftCluster() testutil.Eventually(re, func() bool { @@ -267,7 +267,7 @@ func TestHotWithStoreID(t *testing.T) { WriteBytes: []uint64{13 * units.MiB}, WriteQps: []uint64{0}, } - buckets := pdctl.MustReportBuckets(re, cluster, 1, []byte("a"), []byte("b"), stats) + buckets := tests.MustReportBuckets(re, cluster, 1, []byte("a"), []byte("b"), stats) args = []string{"-u", pdAddr, "hot", "buckets", "1"} output, err = pdctl.ExecuteCommand(cmd, args...) re.NoError(err) @@ -330,20 +330,20 @@ func TestHistoryHotRegions(t *testing.T) { }, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) } defer cluster.Destroy() startTime := time.Now().Unix() - pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), + tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), core.SetReportInterval(uint64(startTime-utils.RegionHeartBeatReportInterval), uint64(startTime))) - pdctl.MustPutRegion(re, cluster, 2, 2, []byte("c"), []byte("d"), core.SetWrittenBytes(6000000000), + tests.MustPutRegion(re, cluster, 2, 2, []byte("c"), []byte("d"), core.SetWrittenBytes(6000000000), core.SetReportInterval(uint64(startTime-utils.RegionHeartBeatReportInterval), uint64(startTime))) - pdctl.MustPutRegion(re, cluster, 3, 1, []byte("e"), []byte("f"), core.SetWrittenBytes(9000000000), + tests.MustPutRegion(re, cluster, 3, 1, []byte("e"), []byte("f"), core.SetWrittenBytes(9000000000), core.SetReportInterval(uint64(startTime-utils.RegionHeartBeatReportInterval), uint64(startTime))) - pdctl.MustPutRegion(re, cluster, 4, 3, []byte("g"), []byte("h"), core.SetWrittenBytes(9000000000), + tests.MustPutRegion(re, cluster, 4, 3, []byte("g"), []byte("h"), core.SetWrittenBytes(9000000000), core.SetReportInterval(uint64(startTime-utils.RegionHeartBeatReportInterval), uint64(startTime))) // wait hot scheduler starts testutil.Eventually(re, func() bool { @@ -440,10 +440,10 @@ func TestHotWithoutHotPeer(t *testing.T) { }, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) } timestamp := uint64(time.Now().UnixNano()) load := 1024.0 diff --git a/tests/pdctl/keyspace/keyspace_group_test.go b/tests/pdctl/keyspace/keyspace_group_test.go index 105e860ad17..0b09550d967 100644 --- 
a/tests/pdctl/keyspace/keyspace_group_test.go +++ b/tests/pdctl/keyspace/keyspace_group_test.go @@ -44,7 +44,7 @@ func TestKeyspaceGroup(t *testing.T) { err = tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) pdAddr := tc.GetConfig().GetClientURL() cmd := pdctlCmd.GetRootCmd() @@ -113,7 +113,7 @@ func TestSplitKeyspaceGroup(t *testing.T) { cmd := pdctlCmd.GetRootCmd() tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) // split keyspace group. @@ -164,7 +164,7 @@ func TestExternalAllocNodeWhenStart(t *testing.T) { cmd := pdctlCmd.GetRootCmd() tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) // check keyspace group information. @@ -207,7 +207,7 @@ func TestSetNodeAndPriorityKeyspaceGroup(t *testing.T) { cmd := pdctlCmd.GetRootCmd() tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) // set-node keyspace group. @@ -309,7 +309,7 @@ func TestMergeKeyspaceGroup(t *testing.T) { cmd := pdctlCmd.GetRootCmd() tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) // split keyspace group. @@ -427,7 +427,7 @@ func TestKeyspaceGroupState(t *testing.T) { cmd := pdctlCmd.GetRootCmd() tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) // split keyspace group. @@ -518,7 +518,7 @@ func TestShowKeyspaceGroupPrimary(t *testing.T) { cmd := pdctlCmd.GetRootCmd() tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) defaultKeyspaceGroupID := fmt.Sprintf("%d", utils.DefaultKeyspaceGroupID) @@ -600,7 +600,7 @@ func TestInPDMode(t *testing.T) { pdAddr := tc.GetConfig().GetClientURL() cmd := pdctlCmd.GetRootCmd() tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) argsList := [][]string{ diff --git a/tests/pdctl/keyspace/keyspace_test.go b/tests/pdctl/keyspace/keyspace_test.go index a0bab4114df..57acdc86c70 100644 --- a/tests/pdctl/keyspace/keyspace_test.go +++ b/tests/pdctl/keyspace/keyspace_test.go @@ -58,7 +58,7 @@ func TestKeyspace(t *testing.T) { cmd := pdctlCmd.GetRootCmd() tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) defaultKeyspaceGroupID := fmt.Sprintf("%d", utils.DefaultKeyspaceGroupID) diff --git a/tests/pdctl/label/label_test.go b/tests/pdctl/label/label_test.go index ba31b1fb1d1..9c64933a127 100644 --- a/tests/pdctl/label/label_test.go +++ b/tests/pdctl/label/label_test.go @@ -92,11 +92,11 @@ func TestLabel(t *testing.T) { }, }, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store.Store.Store) + tests.MustPutStore(re, cluster, store.Store.Store) } defer cluster.Destroy() diff --git a/tests/pdctl/log/log_test.go b/tests/pdctl/log/log_test.go index 
7f2e4f20584..e6995231329 100644 --- a/tests/pdctl/log/log_test.go +++ b/tests/pdctl/log/log_test.go @@ -21,7 +21,6 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/stretchr/testify/suite" - "github.com/tikv/pd/server" "github.com/tikv/pd/tests" "github.com/tikv/pd/tests/pdctl" pdctlCmd "github.com/tikv/pd/tools/pd-ctl/pdctl" @@ -32,7 +31,6 @@ type logTestSuite struct { ctx context.Context cancel context.CancelFunc cluster *tests.TestCluster - svr *server.Server pdAddrs []string } @@ -54,10 +52,9 @@ func (suite *logTestSuite) SetupSuite() { State: metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := suite.cluster.GetServer(suite.cluster.GetLeader()) + leaderServer := suite.cluster.GetLeaderServer() suite.NoError(leaderServer.BootstrapCluster()) - suite.svr = leaderServer.GetServer() - pdctl.MustPutStore(suite.Require(), suite.svr, store) + tests.MustPutStore(suite.Require(), suite.cluster, store) } func (suite *logTestSuite) TearDownSuite() { @@ -97,7 +94,7 @@ func (suite *logTestSuite) TestLog() { for _, testCase := range testCases { _, err := pdctl.ExecuteCommand(cmd, testCase.cmd...) suite.NoError(err) - suite.Equal(testCase.expect, suite.svr.GetConfig().Log.Level) + suite.Equal(testCase.expect, suite.cluster.GetLeaderServer().GetConfig().Log.Level) } } diff --git a/tests/pdctl/member/member_test.go b/tests/pdctl/member/member_test.go index 9c787499253..af3ee771e82 100644 --- a/tests/pdctl/member/member_test.go +++ b/tests/pdctl/member/member_test.go @@ -38,7 +38,7 @@ func TestMember(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) pdAddr := cluster.GetConfig().GetClientURL() re.NoError(err) diff --git a/tests/pdctl/operator/operator_test.go b/tests/pdctl/operator/operator_test.go index 148cbc9e081..a95c620adcf 100644 --- a/tests/pdctl/operator/operator_test.go +++ b/tests/pdctl/operator/operator_test.go @@ -79,17 +79,17 @@ func TestOperator(t *testing.T) { }, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) } - pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetPeers([]*metapb.Peer{ + tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetPeers([]*metapb.Peer{ {Id: 1, StoreId: 1}, {Id: 2, StoreId: 2}, })) - pdctl.MustPutRegion(re, cluster, 3, 2, []byte("b"), []byte("d"), core.SetPeers([]*metapb.Peer{ + tests.MustPutRegion(re, cluster, 3, 2, []byte("b"), []byte("d"), core.SetPeers([]*metapb.Peer{ {Id: 3, StoreId: 1}, {Id: 4, StoreId: 2}, })) @@ -261,7 +261,7 @@ func TestForwardOperatorRequest(t *testing.T) { re.NoError(err) re.NoError(cluster.RunInitialServers()) re.NotEmpty(cluster.WaitLeader()) - server := cluster.GetServer(cluster.GetLeader()) + server := cluster.GetLeaderServer() re.NoError(server.BootstrapCluster()) backendEndpoints := server.GetAddr() tc, err := tests.NewTestSchedulingCluster(ctx, 2, backendEndpoints) diff --git a/tests/pdctl/region/region_test.go b/tests/pdctl/region/region_test.go index d56463d728d..b913f1b0923 100644 --- a/tests/pdctl/region/region_test.go +++ b/tests/pdctl/region/region_test.go @@ -45,9 +45,9 @@ func TestRegionKeyFormat(t *testing.T) { State: 
metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) cmd := pdctlCmd.GetRootCmd() output, err := pdctl.ExecuteCommand(cmd, "-u", url, "region", "key", "--format=raw", " ") @@ -72,12 +72,12 @@ func TestRegion(t *testing.T) { State: metapb.StoreState_Up, LastHeartbeat: time.Now().UnixNano(), } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) downPeer := &metapb.Peer{Id: 8, StoreId: 3} - r1 := pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), + r1 := tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetWrittenBytes(1000), core.SetReadBytes(1000), core.SetRegionConfVer(1), core.SetRegionVersion(1), core.SetApproximateSize(1), core.SetApproximateKeys(100), core.SetPeers([]*metapb.Peer{ @@ -86,16 +86,16 @@ func TestRegion(t *testing.T) { {Id: 6, StoreId: 3}, {Id: 7, StoreId: 4}, })) - r2 := pdctl.MustPutRegion(re, cluster, 2, 1, []byte("b"), []byte("c"), + r2 := tests.MustPutRegion(re, cluster, 2, 1, []byte("b"), []byte("c"), core.SetWrittenBytes(2000), core.SetReadBytes(0), core.SetRegionConfVer(2), core.SetRegionVersion(3), core.SetApproximateSize(144), core.SetApproximateKeys(14400), ) - r3 := pdctl.MustPutRegion(re, cluster, 3, 1, []byte("c"), []byte("d"), + r3 := tests.MustPutRegion(re, cluster, 3, 1, []byte("c"), []byte("d"), core.SetWrittenBytes(500), core.SetReadBytes(800), core.SetRegionConfVer(3), core.SetRegionVersion(2), core.SetApproximateSize(30), core.SetApproximateKeys(3000), core.WithDownPeers([]*pdpb.PeerStats{{Peer: downPeer, DownSeconds: 3600}}), core.WithPendingPeers([]*metapb.Peer{downPeer}), core.WithLearners([]*metapb.Peer{{Id: 3, StoreId: 1}})) - r4 := pdctl.MustPutRegion(re, cluster, 4, 1, []byte("d"), []byte("e"), + r4 := tests.MustPutRegion(re, cluster, 4, 1, []byte("d"), []byte("e"), core.SetWrittenBytes(100), core.SetReadBytes(100), core.SetRegionConfVer(1), core.SetRegionVersion(1), core.SetApproximateSize(10), core.SetApproximateKeys(1000), ) @@ -197,7 +197,7 @@ func TestRegion(t *testing.T) { } // Test region range-holes. - r5 := pdctl.MustPutRegion(re, cluster, 5, 1, []byte("x"), []byte("z")) + r5 := tests.MustPutRegion(re, cluster, 5, 1, []byte("x"), []byte("z")) output, err := pdctl.ExecuteCommand(cmd, []string{"-u", pdAddr, "region", "range-holes"}...) re.NoError(err) rangeHoles := new([][]string) diff --git a/tests/pdctl/scheduler/scheduler_test.go b/tests/pdctl/scheduler/scheduler_test.go index 31e6270aa3b..f2d44a589a4 100644 --- a/tests/pdctl/scheduler/scheduler_test.go +++ b/tests/pdctl/scheduler/scheduler_test.go @@ -94,14 +94,14 @@ func TestScheduler(t *testing.T) { re.Equal(expectedConfig, configInfo) } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) } // note: because pdqsort is a unstable sort algorithm, set ApproximateSize for this region. 
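The helpers removed from tests/pdctl/helper.go reappear in tests/testutil.go (+72 lines in the diffstat), which is not reproduced in this excerpt. Based on the deleted implementation and the new call signature, the relocated MustPutStore presumably looks roughly like the sketch below; the body is reconstructed, not quoted from the patch:

    // MustPutStore is used for test purpose.
    func MustPutStore(re *require.Assertions, cluster *TestCluster, store *metapb.Store) {
        store.Address = fmt.Sprintf("tikv%d", store.GetId())
        if len(store.Version) == 0 {
            store.Version = versioninfo.MinSupportedVersion(versioninfo.Version2_0).String()
        }
        svr := cluster.GetLeaderServer().GetServer()
        grpcServer := &server.GrpcServer{Server: svr}
        _, err := grpcServer.PutStore(context.Background(), &pdpb.PutStoreRequest{
            Header: &pdpb.RequestHeader{ClusterId: svr.ClusterID()},
            Store:  store,
        })
        re.NoError(err)
        // The remainder mirrors the deleted pdctl helper: clone the store with synthetic
        // capacity stats and put it back through the raft cluster's basic cluster.
    }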
- pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetApproximateSize(10)) + tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetApproximateSize(10)) time.Sleep(3 * time.Second) // scheduler show command @@ -363,7 +363,7 @@ func TestScheduler(t *testing.T) { for _, store := range stores { version := versioninfo.HotScheduleWithQuery store.Version = versioninfo.MinSupportedVersion(version).String() - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) } re.Equal("5.2.0", leaderServer.GetClusterVersion().String()) // After upgrading, we should not use query. @@ -488,14 +488,14 @@ func TestSchedulerDiagnostic(t *testing.T) { LastHeartbeat: time.Now().UnixNano(), }, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) } // note: because pdqsort is a unstable sort algorithm, set ApproximateSize for this region. - pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetApproximateSize(10)) + tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetApproximateSize(10)) time.Sleep(3 * time.Second) echo := mustExec(re, cmd, []string{"-u", pdAddr, "config", "set", "enable-diagnostic", "true"}, nil) @@ -539,7 +539,7 @@ func TestForwardSchedulerRequest(t *testing.T) { re.NoError(err) re.NoError(cluster.RunInitialServers()) re.NotEmpty(cluster.WaitLeader()) - server := cluster.GetServer(cluster.GetLeader()) + server := cluster.GetLeaderServer() re.NoError(server.BootstrapCluster()) backendEndpoints := server.GetAddr() tc, err := tests.NewTestSchedulingCluster(ctx, 2, backendEndpoints) diff --git a/tests/pdctl/store/store_test.go b/tests/pdctl/store/store_test.go index 0ac68e35d98..13c7350bb6f 100644 --- a/tests/pdctl/store/store_test.go +++ b/tests/pdctl/store/store_test.go @@ -79,11 +79,11 @@ func TestStore(t *testing.T) { }, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store.Store.Store) + tests.MustPutStore(re, cluster, store.Store.Store) } defer cluster.Destroy() @@ -293,7 +293,7 @@ func TestStore(t *testing.T) { NodeState: metapb.NodeState_Serving, LastHeartbeat: time.Now().UnixNano(), } - pdctl.MustPutStore(re, leaderServer.GetServer(), store2) + tests.MustPutStore(re, cluster, store2) } // store delete command @@ -506,15 +506,15 @@ func TestTombstoneStore(t *testing.T) { }, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store.Store.Store) + tests.MustPutStore(re, cluster, store.Store.Store) } defer cluster.Destroy() - pdctl.MustPutRegion(re, cluster, 1, 2, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), core.SetReportInterval(0, utils.RegionHeartBeatReportInterval)) - pdctl.MustPutRegion(re, cluster, 2, 3, []byte("b"), []byte("c"), core.SetWrittenBytes(3000000000), core.SetReportInterval(0, utils.RegionHeartBeatReportInterval)) + tests.MustPutRegion(re, cluster, 1, 2, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), core.SetReportInterval(0, 
utils.RegionHeartBeatReportInterval)) + tests.MustPutRegion(re, cluster, 2, 3, []byte("b"), []byte("c"), core.SetWrittenBytes(3000000000), core.SetReportInterval(0, utils.RegionHeartBeatReportInterval)) // store remove-tombstone args := []string{"-u", pdAddr, "store", "remove-tombstone"} output, err := pdctl.ExecuteCommand(cmd, args...) diff --git a/tests/pdctl/unsafe/unsafe_operation_test.go b/tests/pdctl/unsafe/unsafe_operation_test.go index 1e4e3468225..e0fdb983591 100644 --- a/tests/pdctl/unsafe/unsafe_operation_test.go +++ b/tests/pdctl/unsafe/unsafe_operation_test.go @@ -33,7 +33,7 @@ func TestRemoveFailedStores(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) cluster.WaitLeader() - err = cluster.GetServer(cluster.GetLeader()).BootstrapCluster() + err = cluster.GetLeaderServer().BootstrapCluster() re.NoError(err) pdAddr := cluster.GetConfig().GetClientURL() cmd := pdctlCmd.GetRootCmd() diff --git a/tests/registry/registry_test.go b/tests/registry/registry_test.go index da68bddd354..a3aff76a1cf 100644 --- a/tests/registry/registry_test.go +++ b/tests/registry/registry_test.go @@ -76,8 +76,8 @@ func TestRegistryService(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) - leaderName := cluster.WaitLeader() - leader := cluster.GetServer(leaderName) + re.NotEmpty(cluster.WaitLeader()) + leader := cluster.GetLeaderServer() // Test registered GRPC Service cc, err := grpc.DialContext(ctx, strings.TrimPrefix(leader.GetAddr(), "http://"), grpc.WithInsecure()) diff --git a/tests/server/api/api_test.go b/tests/server/api/api_test.go index cc35d9eaab3..ff430f1b848 100644 --- a/tests/server/api/api_test.go +++ b/tests/server/api/api_test.go @@ -40,7 +40,6 @@ import ( "github.com/tikv/pd/server/api" "github.com/tikv/pd/server/config" "github.com/tikv/pd/tests" - "github.com/tikv/pd/tests/pdctl" "go.uber.org/goleak" ) @@ -64,6 +63,7 @@ func TestReconnect(t *testing.T) { // Make connections to followers. // Make sure they proxy requests to the leader. 
leader := cluster.WaitLeader() + re.NotEmpty(leader) for name, s := range cluster.GetServers() { if name != leader { res, err := http.Get(s.GetConfig().AdvertiseClientUrls + "/pd/api/v1/version") @@ -136,7 +136,7 @@ func (suite *middlewareTestSuite) TearDownSuite() { func (suite *middlewareTestSuite) TestRequestInfoMiddleware() { suite.NoError(failpoint.Enable("github.com/tikv/pd/server/api/addRequestInfoMiddleware", "return(true)")) - leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + leader := suite.cluster.GetLeaderServer() suite.NotNil(leader) input := map[string]interface{}{ @@ -190,7 +190,7 @@ func BenchmarkDoRequestWithServiceMiddleware(b *testing.B) { cluster, _ := tests.NewTestCluster(ctx, 1) cluster.RunInitialServers() cluster.WaitLeader() - leader := cluster.GetServer(cluster.GetLeader()) + leader := cluster.GetLeaderServer() input := map[string]interface{}{ "enable-audit": "true", } @@ -207,7 +207,7 @@ func BenchmarkDoRequestWithServiceMiddleware(b *testing.B) { } func (suite *middlewareTestSuite) TestRateLimitMiddleware() { - leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + leader := suite.cluster.GetLeaderServer() suite.NotNil(leader) input := map[string]interface{}{ "enable-rate-limit": "true", @@ -296,7 +296,7 @@ func (suite *middlewareTestSuite) TestRateLimitMiddleware() { servers = append(servers, s.GetServer()) } server.MustWaitLeader(suite.Require(), servers) - leader = suite.cluster.GetServer(suite.cluster.GetLeader()) + leader = suite.cluster.GetLeaderServer() suite.Equal(leader.GetServer().GetServiceMiddlewarePersistOptions().IsRateLimitEnabled(), true) cfg, ok := leader.GetServer().GetRateLimitConfig().LimiterConfig["SetLogLevel"] suite.Equal(ok, true) @@ -372,7 +372,7 @@ func (suite *middlewareTestSuite) TestRateLimitMiddleware() { } func (suite *middlewareTestSuite) TestSwaggerUrl() { - leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + leader := suite.cluster.GetLeaderServer() suite.NotNil(leader) req, _ := http.NewRequest(http.MethodGet, leader.GetAddr()+"/swagger/ui/index", nil) resp, err := dialClient.Do(req) @@ -382,7 +382,7 @@ func (suite *middlewareTestSuite) TestSwaggerUrl() { } func (suite *middlewareTestSuite) TestAuditPrometheusBackend() { - leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + leader := suite.cluster.GetLeaderServer() suite.NotNil(leader) input := map[string]interface{}{ "enable-audit": "true", @@ -418,7 +418,7 @@ func (suite *middlewareTestSuite) TestAuditPrometheusBackend() { servers = append(servers, s.GetServer()) } server.MustWaitLeader(suite.Require(), servers) - leader = suite.cluster.GetServer(suite.cluster.GetLeader()) + leader = suite.cluster.GetLeaderServer() timeUnix = time.Now().Unix() - 20 req, _ = http.NewRequest(http.MethodGet, fmt.Sprintf("%s/pd/api/v1/trend?from=%d", leader.GetAddr(), timeUnix), nil) @@ -451,7 +451,7 @@ func (suite *middlewareTestSuite) TestAuditPrometheusBackend() { func (suite *middlewareTestSuite) TestAuditLocalLogBackend() { fname := testutil.InitTempFileLogger("info") defer os.RemoveAll(fname) - leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + leader := suite.cluster.GetLeaderServer() suite.NotNil(leader) input := map[string]interface{}{ "enable-audit": "true", @@ -481,7 +481,7 @@ func BenchmarkDoRequestWithLocalLogAudit(b *testing.B) { cluster, _ := tests.NewTestCluster(ctx, 1) cluster.RunInitialServers() cluster.WaitLeader() - leader := cluster.GetServer(cluster.GetLeader()) + leader := cluster.GetLeaderServer() input := 
map[string]interface{}{ "enable-audit": "true", } @@ -503,7 +503,7 @@ func BenchmarkDoRequestWithPrometheusAudit(b *testing.B) { cluster, _ := tests.NewTestCluster(ctx, 1) cluster.RunInitialServers() cluster.WaitLeader() - leader := cluster.GetServer(cluster.GetLeader()) + leader := cluster.GetLeaderServer() input := map[string]interface{}{ "enable-audit": "true", } @@ -525,7 +525,7 @@ func BenchmarkDoRequestWithoutServiceMiddleware(b *testing.B) { cluster, _ := tests.NewTestCluster(ctx, 1) cluster.RunInitialServers() cluster.WaitLeader() - leader := cluster.GetServer(cluster.GetLeader()) + leader := cluster.GetLeaderServer() input := map[string]interface{}{ "enable-audit": "false", } @@ -586,7 +586,7 @@ func (suite *redirectorTestSuite) TearDownSuite() { func (suite *redirectorTestSuite) TestRedirect() { re := suite.Require() - leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + leader := suite.cluster.GetLeaderServer() suite.NotNil(leader) header := mustRequestSuccess(re, leader.GetServer()) header.Del("Date") @@ -602,7 +602,7 @@ func (suite *redirectorTestSuite) TestRedirect() { func (suite *redirectorTestSuite) TestAllowFollowerHandle() { // Find a follower. var follower *server.Server - leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + leader := suite.cluster.GetLeaderServer() for _, svr := range suite.cluster.GetServers() { if svr != leader { follower = svr.GetServer() @@ -626,7 +626,7 @@ func (suite *redirectorTestSuite) TestAllowFollowerHandle() { func (suite *redirectorTestSuite) TestNotLeader() { // Find a follower. var follower *server.Server - leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + leader := suite.cluster.GetLeaderServer() for _, svr := range suite.cluster.GetServers() { if svr != leader { follower = svr.GetServer() @@ -657,7 +657,7 @@ func (suite *redirectorTestSuite) TestNotLeader() { } func (suite *redirectorTestSuite) TestXForwardedFor() { - leader := suite.cluster.GetServer(suite.cluster.GetLeader()) + leader := suite.cluster.GetLeaderServer() suite.NoError(leader.BootstrapCluster()) fname := testutil.InitTempFileLogger("info") defer os.RemoveAll(fname) @@ -702,7 +702,7 @@ func TestRemovingProgress(t *testing.T) { re.NoError(err) cluster.WaitLeader() - leader := cluster.GetServer(cluster.GetLeader()) + leader := cluster.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leader.GetAddr()) clusterID := leader.GetClusterID() req := &pdpb.BootstrapRequest{ @@ -735,12 +735,12 @@ func TestRemovingProgress(t *testing.T) { } for _, store := range stores { - pdctl.MustPutStore(re, leader.GetServer(), store) + tests.MustPutStore(re, cluster, store) } - pdctl.MustPutRegion(re, cluster, 1000, 1, []byte("a"), []byte("b"), core.SetApproximateSize(60)) - pdctl.MustPutRegion(re, cluster, 1001, 2, []byte("c"), []byte("d"), core.SetApproximateSize(30)) - pdctl.MustPutRegion(re, cluster, 1002, 1, []byte("e"), []byte("f"), core.SetApproximateSize(50)) - pdctl.MustPutRegion(re, cluster, 1003, 2, []byte("g"), []byte("h"), core.SetApproximateSize(40)) + tests.MustPutRegion(re, cluster, 1000, 1, []byte("a"), []byte("b"), core.SetApproximateSize(60)) + tests.MustPutRegion(re, cluster, 1001, 2, []byte("c"), []byte("d"), core.SetApproximateSize(30)) + tests.MustPutRegion(re, cluster, 1002, 1, []byte("e"), []byte("f"), core.SetApproximateSize(50)) + tests.MustPutRegion(re, cluster, 1003, 2, []byte("g"), []byte("h"), core.SetApproximateSize(40)) // no store removing output := sendRequest(re, 
leader.GetAddr()+"/pd/api/v1/stores/progress?action=removing", http.MethodGet, http.StatusNotFound) @@ -762,8 +762,8 @@ func TestRemovingProgress(t *testing.T) { re.Equal(math.MaxFloat64, p.LeftSeconds) // update size - pdctl.MustPutRegion(re, cluster, 1000, 1, []byte("a"), []byte("b"), core.SetApproximateSize(20)) - pdctl.MustPutRegion(re, cluster, 1001, 2, []byte("c"), []byte("d"), core.SetApproximateSize(10)) + tests.MustPutRegion(re, cluster, 1000, 1, []byte("a"), []byte("b"), core.SetApproximateSize(20)) + tests.MustPutRegion(re, cluster, 1001, 2, []byte("c"), []byte("d"), core.SetApproximateSize(10)) // is not prepared time.Sleep(2 * time.Second) @@ -817,7 +817,8 @@ func TestSendApiWhenRestartRaftCluster(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) - leader := cluster.GetServer(cluster.WaitLeader()) + re.NotEmpty(cluster.WaitLeader()) + leader := cluster.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leader.GetAddr()) clusterID := leader.GetClusterID() @@ -860,7 +861,7 @@ func TestPreparingProgress(t *testing.T) { re.NoError(err) cluster.WaitLeader() - leader := cluster.GetServer(cluster.GetLeader()) + leader := cluster.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leader.GetAddr()) clusterID := leader.GetClusterID() req := &pdpb.BootstrapRequest{ @@ -910,10 +911,10 @@ func TestPreparingProgress(t *testing.T) { } for _, store := range stores { - pdctl.MustPutStore(re, leader.GetServer(), store) + tests.MustPutStore(re, cluster, store) } for i := 0; i < 100; i++ { - pdctl.MustPutRegion(re, cluster, uint64(i+1), uint64(i)%3+1, []byte(fmt.Sprintf("p%d", i)), []byte(fmt.Sprintf("%d", i+1)), core.SetApproximateSize(10)) + tests.MustPutRegion(re, cluster, uint64(i+1), uint64(i)%3+1, []byte(fmt.Sprintf("p%d", i)), []byte(fmt.Sprintf("%d", i+1)), core.SetApproximateSize(10)) } // no store preparing output := sendRequest(re, leader.GetAddr()+"/pd/api/v1/stores/progress?action=preparing", http.MethodGet, http.StatusNotFound) @@ -940,8 +941,8 @@ func TestPreparingProgress(t *testing.T) { re.Equal(math.MaxFloat64, p.LeftSeconds) // update size - pdctl.MustPutRegion(re, cluster, 1000, 4, []byte(fmt.Sprintf("%d", 1000)), []byte(fmt.Sprintf("%d", 1001)), core.SetApproximateSize(10)) - pdctl.MustPutRegion(re, cluster, 1001, 5, []byte(fmt.Sprintf("%d", 1001)), []byte(fmt.Sprintf("%d", 1002)), core.SetApproximateSize(40)) + tests.MustPutRegion(re, cluster, 1000, 4, []byte(fmt.Sprintf("%d", 1000)), []byte(fmt.Sprintf("%d", 1001)), core.SetApproximateSize(10)) + tests.MustPutRegion(re, cluster, 1001, 5, []byte(fmt.Sprintf("%d", 1001)), []byte(fmt.Sprintf("%d", 1002)), core.SetApproximateSize(40)) time.Sleep(2 * time.Second) output = sendRequest(re, leader.GetAddr()+"/pd/api/v1/stores/progress?action=preparing", http.MethodGet, http.StatusOK) re.NoError(json.Unmarshal(output, &p)) diff --git a/tests/server/apiv2/handlers/keyspace_test.go b/tests/server/apiv2/handlers/keyspace_test.go index 7fd8de013f7..f7b43ab194d 100644 --- a/tests/server/apiv2/handlers/keyspace_test.go +++ b/tests/server/apiv2/handlers/keyspace_test.go @@ -53,7 +53,7 @@ func (suite *keyspaceTestSuite) SetupTest() { suite.NoError(err) suite.NoError(cluster.RunInitialServers()) suite.NotEmpty(cluster.WaitLeader()) - suite.server = cluster.GetServer(cluster.GetLeader()) + suite.server = cluster.GetLeaderServer() suite.NoError(suite.server.BootstrapCluster()) suite.NoError(failpoint.Enable("github.com/tikv/pd/pkg/keyspace/skipSplitRegion", "return(true)")) } diff --git 
a/tests/server/apiv2/handlers/tso_keyspace_group_test.go b/tests/server/apiv2/handlers/tso_keyspace_group_test.go index 1f0189c532f..214de6e95ef 100644 --- a/tests/server/apiv2/handlers/tso_keyspace_group_test.go +++ b/tests/server/apiv2/handlers/tso_keyspace_group_test.go @@ -45,7 +45,7 @@ func (suite *keyspaceGroupTestSuite) SetupTest() { suite.NoError(err) suite.NoError(cluster.RunInitialServers()) suite.NotEmpty(cluster.WaitLeader()) - suite.server = cluster.GetServer(cluster.GetLeader()) + suite.server = cluster.GetLeaderServer() suite.NoError(suite.server.BootstrapCluster()) } diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index f22a754b8bf..e1b04c4ebc1 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -71,7 +71,7 @@ func TestBootstrap(t *testing.T) { re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() @@ -111,7 +111,7 @@ func TestDamagedRegion(t *testing.T) { re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -191,7 +191,7 @@ func TestStaleRegion(t *testing.T) { re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -236,7 +236,7 @@ func TestGetPutConfig(t *testing.T) { re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -465,7 +465,7 @@ func TestRaftClusterRestart(t *testing.T) { re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -495,7 +495,7 @@ func TestRaftClusterMultipleRestart(t *testing.T) { re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -538,7 +538,7 @@ func TestGetPDMembers(t *testing.T) { re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() req := &pdpb.GetMembersRequest{Header: testutil.NewRequestHeader(clusterID)} @@ -582,7 +582,7 @@ func TestStoreVersionChange(t *testing.T) { re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -620,7 +620,7 @@ func TestConcurrentHandleRegion(t *testing.T) { err = 
tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -737,7 +737,7 @@ func TestSetScheduleOpt(t *testing.T) { re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -808,7 +808,7 @@ func TestLoadClusterInfo(t *testing.T) { re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() svr := leaderServer.GetServer() rc := cluster.NewRaftCluster(ctx, svr.ClusterID(), syncer.NewRegionSyncer(svr), svr.GetClient(), svr.GetHTTPClient()) @@ -896,7 +896,7 @@ func TestTiFlashWithPlacementRules(t *testing.T) { err = tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -949,7 +949,7 @@ func TestReplicationModeStatus(t *testing.T) { err = tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() req := newBootstrapRequest(clusterID) @@ -1049,7 +1049,7 @@ func TestOfflineStoreLimit(t *testing.T) { err = tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -1141,7 +1141,7 @@ func TestUpgradeStoreLimit(t *testing.T) { err = tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -1199,7 +1199,7 @@ func TestStaleTermHeartbeat(t *testing.T) { err = tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -1334,7 +1334,7 @@ func TestMinResolvedTS(t *testing.T) { err = tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() id := leaderServer.GetAllocator() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() @@ -1443,7 +1443,7 @@ func TestTransferLeaderBack(t *testing.T) { err = tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() svr := leaderServer.GetServer() rc := cluster.NewRaftCluster(ctx, svr.ClusterID(), syncer.NewRegionSyncer(svr), svr.GetClient(), svr.GetHTTPClient()) 
rc.InitCluster(svr.GetAllocator(), svr.GetPersistOptions(), svr.GetStorage(), svr.GetBasicCluster(), svr.GetKeyspaceGroupManager()) @@ -1470,7 +1470,7 @@ func TestTransferLeaderBack(t *testing.T) { // transfer PD leader to another PD tc.ResignLeader() tc.WaitLeader() - leaderServer = tc.GetServer(tc.GetLeader()) + leaderServer = tc.GetLeaderServer() svr1 := leaderServer.GetServer() rc1 := svr1.GetRaftCluster() re.NoError(err) @@ -1483,7 +1483,7 @@ func TestTransferLeaderBack(t *testing.T) { // transfer PD leader back to the previous PD tc.ResignLeader() tc.WaitLeader() - leaderServer = tc.GetServer(tc.GetLeader()) + leaderServer = tc.GetLeaderServer() svr = leaderServer.GetServer() rc = svr.GetRaftCluster() re.NotNil(rc) @@ -1503,7 +1503,7 @@ func TestExternalTimestamp(t *testing.T) { err = tc.RunInitialServers() re.NoError(err) tc.WaitLeader() - leaderServer := tc.GetServer(tc.GetLeader()) + leaderServer := tc.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) diff --git a/tests/server/cluster/cluster_work_test.go b/tests/server/cluster/cluster_work_test.go index f0f24ca6777..ef09e522305 100644 --- a/tests/server/cluster/cluster_work_test.go +++ b/tests/server/cluster/cluster_work_test.go @@ -42,7 +42,7 @@ func TestValidRequestRegion(t *testing.T) { re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -86,7 +86,7 @@ func TestAskSplit(t *testing.T) { re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) @@ -143,7 +143,7 @@ func TestSuspectRegions(t *testing.T) { re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() bootstrapCluster(re, clusterID, grpcPDClient) diff --git a/tests/server/config/config_test.go b/tests/server/config/config_test.go index b9a746b8bed..1b2178bde33 100644 --- a/tests/server/config/config_test.go +++ b/tests/server/config/config_test.go @@ -43,7 +43,7 @@ func TestRateLimitConfigReload(t *testing.T) { defer cluster.Destroy() re.NoError(cluster.RunInitialServers()) re.NotEmpty(cluster.WaitLeader()) - leader := cluster.GetServer(cluster.GetLeader()) + leader := cluster.GetLeaderServer() re.NotNil(leader) re.Empty(leader.GetServer().GetServiceMiddlewareConfig().RateLimitConfig.LimiterConfig) limitCfg := make(map[string]ratelimit.DimensionConfig) @@ -69,7 +69,7 @@ func TestRateLimitConfigReload(t *testing.T) { servers = append(servers, s.GetServer()) } server.MustWaitLeader(re, servers) - leader = cluster.GetServer(cluster.GetLeader()) + leader = cluster.GetLeaderServer() re.NotNil(leader) re.True(leader.GetServer().GetServiceMiddlewarePersistOptions().IsRateLimitEnabled()) re.Len(leader.GetServer().GetServiceMiddlewarePersistOptions().GetRateLimitConfig().LimiterConfig, 1) diff --git a/tests/server/id/id_test.go b/tests/server/id/id_test.go index c4e1c8bb5de..737aa4deac2 
100644 --- a/tests/server/id/id_test.go +++ b/tests/server/id/id_test.go @@ -44,7 +44,7 @@ func TestID(t *testing.T) { re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() var last uint64 for i := uint64(0); i < allocStep; i++ { id, err := leaderServer.GetAllocator().Alloc() @@ -90,7 +90,7 @@ func TestCommand(t *testing.T) { re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() req := &pdpb.AllocIDRequest{Header: testutil.NewRequestHeader(leaderServer.GetClusterID())} grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) @@ -116,7 +116,7 @@ func TestMonotonicID(t *testing.T) { re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() var last1 uint64 for i := uint64(0); i < 10; i++ { id, err := leaderServer.GetAllocator().Alloc() @@ -127,7 +127,7 @@ func TestMonotonicID(t *testing.T) { err = cluster.ResignLeader() re.NoError(err) cluster.WaitLeader() - leaderServer = cluster.GetServer(cluster.GetLeader()) + leaderServer = cluster.GetLeaderServer() var last2 uint64 for i := uint64(0); i < 10; i++ { id, err := leaderServer.GetAllocator().Alloc() @@ -138,7 +138,7 @@ func TestMonotonicID(t *testing.T) { err = cluster.ResignLeader() re.NoError(err) cluster.WaitLeader() - leaderServer = cluster.GetServer(cluster.GetLeader()) + leaderServer = cluster.GetLeaderServer() id, err := leaderServer.GetAllocator().Alloc() re.NoError(err) re.Greater(id, last2) @@ -162,7 +162,7 @@ func TestPDRestart(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() var last uint64 for i := uint64(0); i < 10; i++ { diff --git a/tests/server/keyspace/keyspace_test.go b/tests/server/keyspace/keyspace_test.go index a36a7379550..86b8f6fd37c 100644 --- a/tests/server/keyspace/keyspace_test.go +++ b/tests/server/keyspace/keyspace_test.go @@ -59,7 +59,7 @@ func (suite *keyspaceTestSuite) SetupTest() { suite.NoError(err) suite.NoError(cluster.RunInitialServers()) suite.NotEmpty(cluster.WaitLeader()) - suite.server = cluster.GetServer(cluster.GetLeader()) + suite.server = cluster.GetLeaderServer() suite.manager = suite.server.GetKeyspaceManager() suite.NoError(suite.server.BootstrapCluster()) } diff --git a/tests/server/member/member_test.go b/tests/server/member/member_test.go index ca89e66a041..26d4fa2a904 100644 --- a/tests/server/member/member_test.go +++ b/tests/server/member/member_test.go @@ -63,7 +63,7 @@ func TestMemberDelete(t *testing.T) { re.NoError(err) leaderName := cluster.WaitLeader() re.NotEmpty(leaderName) - leader := cluster.GetServer(leaderName) + leader := cluster.GetLeaderServer() var members []*tests.TestServer for _, s := range cluster.GetConfig().InitialServers { if s.Name != leaderName { diff --git a/tests/server/region_syncer/region_syncer_test.go b/tests/server/region_syncer/region_syncer_test.go index afa5c87cdcc..f672f82f1f6 100644 --- a/tests/server/region_syncer/region_syncer_test.go +++ b/tests/server/region_syncer/region_syncer_test.go @@ -57,7 +57,7 @@ func TestRegionSyncer(t *testing.T) { re.NoError(cluster.RunInitialServers()) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) rc := 
leaderServer.GetServer().GetRaftCluster() re.NotNil(rc) @@ -140,7 +140,7 @@ func TestRegionSyncer(t *testing.T) { err = leaderServer.Stop() re.NoError(err) cluster.WaitLeader() - leaderServer = cluster.GetServer(cluster.GetLeader()) + leaderServer = cluster.GetLeaderServer() re.NotNil(leaderServer) loadRegions := leaderServer.GetServer().GetRaftCluster().GetRegions() re.Len(loadRegions, regionLen) @@ -166,7 +166,7 @@ func TestFullSyncWithAddMember(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) rc := leaderServer.GetServer().GetRaftCluster() re.NotNil(rc) @@ -210,7 +210,7 @@ func TestPrepareChecker(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) rc := leaderServer.GetServer().GetRaftCluster() re.NotNil(rc) @@ -235,7 +235,7 @@ func TestPrepareChecker(t *testing.T) { err = cluster.ResignLeader() re.NoError(err) re.Equal("pd2", cluster.WaitLeader()) - leaderServer = cluster.GetServer(cluster.GetLeader()) + leaderServer = cluster.GetLeaderServer() rc = leaderServer.GetServer().GetRaftCluster() for _, region := range regions { err = rc.HandleRegionHeartbeat(region) diff --git a/tests/server/storage/hot_region_storage_test.go b/tests/server/storage/hot_region_storage_test.go index 21881802d7d..00d0244a790 100644 --- a/tests/server/storage/hot_region_storage_test.go +++ b/tests/server/storage/hot_region_storage_test.go @@ -29,7 +29,6 @@ import ( "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/server/config" "github.com/tikv/pd/tests" - "github.com/tikv/pd/tests/pdctl" ) func TestHotRegionStorage(t *testing.T) { @@ -61,20 +60,20 @@ func TestHotRegionStorage(t *testing.T) { }, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) } defer cluster.Destroy() startTime := time.Now().Unix() - pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), + tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), core.SetReportInterval(uint64(startTime-utils.RegionHeartBeatReportInterval), uint64(startTime))) - pdctl.MustPutRegion(re, cluster, 2, 2, []byte("c"), []byte("d"), core.SetWrittenBytes(6000000000), + tests.MustPutRegion(re, cluster, 2, 2, []byte("c"), []byte("d"), core.SetWrittenBytes(6000000000), core.SetReportInterval(uint64(startTime-utils.RegionHeartBeatReportInterval), uint64(startTime))) - pdctl.MustPutRegion(re, cluster, 3, 1, []byte("e"), []byte("f"), + tests.MustPutRegion(re, cluster, 3, 1, []byte("e"), []byte("f"), core.SetReportInterval(uint64(startTime-utils.RegionHeartBeatReportInterval), uint64(startTime))) - pdctl.MustPutRegion(re, cluster, 4, 2, []byte("g"), []byte("h"), + tests.MustPutRegion(re, cluster, 4, 2, []byte("g"), []byte("h"), core.SetReportInterval(uint64(startTime-utils.RegionHeartBeatReportInterval), uint64(startTime))) storeStats := []*pdpb.StoreStats{ { @@ -169,14 +168,14 @@ func TestHotRegionStorageReservedDayConfigChange(t *testing.T) { }, } - leaderServer := 
cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) } defer cluster.Destroy() startTime := time.Now().Unix() - pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), + tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), core.SetReportInterval(uint64(startTime-utils.RegionHeartBeatReportInterval), uint64(startTime))) var iter storage.HotRegionStorageIterator var next *storage.HistoryHotRegion @@ -197,7 +196,7 @@ func TestHotRegionStorageReservedDayConfigChange(t *testing.T) { schedule.HotRegionsReservedDays = 0 leaderServer.GetServer().SetScheduleConfig(schedule) time.Sleep(3 * interval) - pdctl.MustPutRegion(re, cluster, 2, 2, []byte("c"), []byte("d"), core.SetWrittenBytes(6000000000), + tests.MustPutRegion(re, cluster, 2, 2, []byte("c"), []byte("d"), core.SetWrittenBytes(6000000000), core.SetReportInterval(uint64(time.Now().Unix()-utils.RegionHeartBeatReportInterval), uint64(time.Now().Unix()))) time.Sleep(10 * interval) hotRegionStorage := leaderServer.GetServer().GetHistoryHotRegionStorage() @@ -261,14 +260,14 @@ func TestHotRegionStorageWriteIntervalConfigChange(t *testing.T) { }, } - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { - pdctl.MustPutStore(re, leaderServer.GetServer(), store) + tests.MustPutStore(re, cluster, store) } defer cluster.Destroy() startTime := time.Now().Unix() - pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), + tests.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetWrittenBytes(3000000000), core.SetReportInterval(uint64(startTime-utils.RegionHeartBeatReportInterval), uint64(startTime))) var iter storage.HotRegionStorageIterator @@ -290,7 +289,7 @@ func TestHotRegionStorageWriteIntervalConfigChange(t *testing.T) { schedule.HotRegionsWriteInterval.Duration = 20 * interval leaderServer.GetServer().SetScheduleConfig(schedule) time.Sleep(3 * interval) - pdctl.MustPutRegion(re, cluster, 2, 2, []byte("c"), []byte("d"), core.SetWrittenBytes(6000000000), + tests.MustPutRegion(re, cluster, 2, 2, []byte("c"), []byte("d"), core.SetWrittenBytes(6000000000), core.SetReportInterval(uint64(time.Now().Unix()-utils.RegionHeartBeatReportInterval), uint64(time.Now().Unix()))) time.Sleep(10 * interval) // it cant get new hot region because wait time smaller than hot region write interval diff --git a/tests/server/tso/consistency_test.go b/tests/server/tso/consistency_test.go index db6e2135d2b..9cfadbf5ba3 100644 --- a/tests/server/tso/consistency_test.go +++ b/tests/server/tso/consistency_test.go @@ -79,7 +79,7 @@ func (suite *tsoConsistencyTestSuite) TestSynchronizedGlobalTSO() { re := suite.Require() cluster.WaitAllLeaders(re, dcLocationConfig) - suite.leaderServer = cluster.GetServer(cluster.GetLeader()) + suite.leaderServer = cluster.GetLeaderServer() suite.NotNil(suite.leaderServer) suite.dcClientMap[tso.GlobalDCLocation] = testutil.MustNewGrpcClient(re, suite.leaderServer.GetAddr()) for _, dcLocation := range dcLocationConfig { @@ -154,7 +154,7 @@ func (suite *tsoConsistencyTestSuite) TestSynchronizedGlobalTSOOverflow() { re := suite.Require() cluster.WaitAllLeaders(re, dcLocationConfig) - suite.leaderServer = 
cluster.GetServer(cluster.GetLeader()) + suite.leaderServer = cluster.GetLeaderServer() suite.NotNil(suite.leaderServer) suite.dcClientMap[tso.GlobalDCLocation] = testutil.MustNewGrpcClient(re, suite.leaderServer.GetAddr()) for _, dcLocation := range dcLocationConfig { @@ -186,7 +186,7 @@ func (suite *tsoConsistencyTestSuite) TestLocalAllocatorLeaderChange() { re := suite.Require() cluster.WaitAllLeaders(re, dcLocationConfig) - suite.leaderServer = cluster.GetServer(cluster.GetLeader()) + suite.leaderServer = cluster.GetLeaderServer() suite.NotNil(suite.leaderServer) suite.dcClientMap[tso.GlobalDCLocation] = testutil.MustNewGrpcClient(re, suite.leaderServer.GetAddr()) for _, dcLocation := range dcLocationConfig { @@ -248,7 +248,7 @@ func (suite *tsoConsistencyTestSuite) TestLocalTSOAfterMemberChanged() { re := suite.Require() cluster.WaitAllLeaders(re, dcLocationConfig) - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() leaderCli := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) req := &pdpb.TsoRequest{ Header: testutil.NewRequestHeader(cluster.GetCluster().GetId()), @@ -286,7 +286,7 @@ func (suite *tsoConsistencyTestSuite) TestLocalTSOAfterMemberChanged() { func (suite *tsoConsistencyTestSuite) testTSO(cluster *tests.TestCluster, dcLocationConfig map[string]string, previousTS *pdpb.Timestamp) { re := suite.Require() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() dcClientMap := make(map[string]pdpb.PDClient) for _, dcLocation := range dcLocationConfig { pdName := leaderServer.GetAllocatorLeader(dcLocation).GetName() diff --git a/tests/server/tso/global_tso_test.go b/tests/server/tso/global_tso_test.go index a6340e2671c..5ae2e6e0f67 100644 --- a/tests/server/tso/global_tso_test.go +++ b/tests/server/tso/global_tso_test.go @@ -97,7 +97,7 @@ func TestDelaySyncTimestamp(t *testing.T) { cluster.WaitLeader() var leaderServer, nextLeaderServer *tests.TestServer - leaderServer = cluster.GetServer(cluster.GetLeader()) + leaderServer = cluster.GetLeaderServer() re.NotNil(leaderServer) for _, s := range cluster.GetServers() { if s.GetConfig().Name != cluster.GetLeader() { @@ -145,7 +145,7 @@ func TestLogicalOverflow(t *testing.T) { re.NoError(cluster.RunInitialServers()) cluster.WaitLeader() - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() diff --git a/tests/server/tso/tso_test.go b/tests/server/tso/tso_test.go index 48df02a6c27..9eff1192e57 100644 --- a/tests/server/tso/tso_test.go +++ b/tests/server/tso/tso_test.go @@ -76,7 +76,7 @@ func TestLoadTimestamp(t *testing.T) { func requestLocalTSOs(re *require.Assertions, cluster *tests.TestCluster, dcLocationConfig map[string]string) map[string]*pdpb.Timestamp { dcClientMap := make(map[string]pdpb.PDClient) tsMap := make(map[string]*pdpb.Timestamp) - leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() for _, dcLocation := range dcLocationConfig { pdName := leaderServer.GetAllocatorLeader(dcLocation).GetName() dcClientMap[dcLocation] = testutil.MustNewGrpcClient(re, cluster.GetServer(pdName).GetAddr()) @@ -125,7 +125,7 @@ func TestDisableLocalTSOAfterEnabling(t *testing.T) { cluster.WaitLeader() // Re-request the global TSOs. 
- leaderServer := cluster.GetServer(cluster.GetLeader()) + leaderServer := cluster.GetLeaderServer() grpcPDClient := testutil.MustNewGrpcClient(re, leaderServer.GetAddr()) clusterID := leaderServer.GetClusterID() req := &pdpb.TsoRequest{ diff --git a/tests/server/watch/leader_watch_test.go b/tests/server/watch/leader_watch_test.go index 049486ba068..f7765297023 100644 --- a/tests/server/watch/leader_watch_test.go +++ b/tests/server/watch/leader_watch_test.go @@ -42,7 +42,7 @@ func TestWatcher(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) cluster.WaitLeader() - pd1 := cluster.GetServer(cluster.GetLeader()) + pd1 := cluster.GetLeaderServer() re.NotNil(pd1) pd2, err := cluster.Join(ctx) @@ -80,7 +80,7 @@ func TestWatcherCompacted(t *testing.T) { err = cluster.RunInitialServers() re.NoError(err) cluster.WaitLeader() - pd1 := cluster.GetServer(cluster.GetLeader()) + pd1 := cluster.GetLeaderServer() re.NotNil(pd1) client := pd1.GetEtcdClient() _, err = client.Put(context.Background(), "test", "v") diff --git a/tests/testutil.go b/tests/testutil.go index 53efcff7658..3fd8e9dca35 100644 --- a/tests/testutil.go +++ b/tests/testutil.go @@ -16,19 +16,26 @@ package tests import ( "context" + "fmt" "os" "sync" "time" + "github.com/docker/go-units" + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/log" "github.com/stretchr/testify/require" bs "github.com/tikv/pd/pkg/basicserver" + "github.com/tikv/pd/pkg/core" rm "github.com/tikv/pd/pkg/mcs/resourcemanager/server" scheduling "github.com/tikv/pd/pkg/mcs/scheduling/server" sc "github.com/tikv/pd/pkg/mcs/scheduling/server/config" tso "github.com/tikv/pd/pkg/mcs/tso/server" "github.com/tikv/pd/pkg/utils/logutil" "github.com/tikv/pd/pkg/utils/testutil" + "github.com/tikv/pd/pkg/versioninfo" + "github.com/tikv/pd/server" "go.uber.org/zap" ) @@ -148,3 +155,68 @@ func WaitForPrimaryServing(re *require.Assertions, serverMap map[string]bs.Serve return primary } + +// MustPutStore is used for test purpose. +func MustPutStore(re *require.Assertions, cluster *TestCluster, store *metapb.Store) { + store.Address = fmt.Sprintf("tikv%d", store.GetId()) + if len(store.Version) == 0 { + store.Version = versioninfo.MinSupportedVersion(versioninfo.Version2_0).String() + } + svr := cluster.GetLeaderServer().GetServer() + grpcServer := &server.GrpcServer{Server: svr} + _, err := grpcServer.PutStore(context.Background(), &pdpb.PutStoreRequest{ + Header: &pdpb.RequestHeader{ClusterId: svr.ClusterID()}, + Store: store, + }) + re.NoError(err) + + storeInfo := grpcServer.GetRaftCluster().GetStore(store.GetId()) + newStore := storeInfo.Clone(core.SetStoreStats(&pdpb.StoreStats{ + Capacity: uint64(10 * units.GiB), + UsedSize: uint64(9 * units.GiB), + Available: uint64(1 * units.GiB), + })) + grpcServer.GetRaftCluster().GetBasicCluster().PutStore(newStore) + if cluster.GetSchedulingPrimaryServer() != nil { + cluster.GetSchedulingPrimaryServer().GetCluster().PutStore(newStore) + } +} + +// MustPutRegion is used for test purpose. +func MustPutRegion(re *require.Assertions, cluster *TestCluster, regionID, storeID uint64, start, end []byte, opts ...core.RegionCreateOption) *core.RegionInfo { + leader := &metapb.Peer{ + Id: regionID, + StoreId: storeID, + } + metaRegion := &metapb.Region{ + Id: regionID, + StartKey: start, + EndKey: end, + Peers: []*metapb.Peer{leader}, + RegionEpoch: &metapb.RegionEpoch{ConfVer: 1, Version: 1}, + } + r := core.NewRegionInfo(metaRegion, leader, opts...) 
+ err := cluster.HandleRegionHeartbeat(r) + re.NoError(err) + if cluster.GetSchedulingPrimaryServer() != nil { + err = cluster.GetSchedulingPrimaryServer().GetCluster().HandleRegionHeartbeat(r) + re.NoError(err) + } + return r +} + +// MustReportBuckets is used for test purpose. +func MustReportBuckets(re *require.Assertions, cluster *TestCluster, regionID uint64, start, end []byte, stats *metapb.BucketStats) *metapb.Buckets { + buckets := &metapb.Buckets{ + RegionId: regionID, + Version: 1, + Keys: [][]byte{start, end}, + Stats: stats, + // report buckets interval is 10s + PeriodInMs: 10000, + } + err := cluster.HandleReportBuckets(buckets) + re.NoError(err) + // TODO: forwards to scheduling server after it supports buckets + return buckets +} From 47e165bb596651d4e718313191b93c539b33279f Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Mon, 25 Sep 2023 17:33:44 +0800 Subject: [PATCH 13/14] TestRequest Signed-off-by: lhy1024 --- tests/pdctl/operator/operator_test.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/pdctl/operator/operator_test.go b/tests/pdctl/operator/operator_test.go index a95c620adcf..538dec9cb92 100644 --- a/tests/pdctl/operator/operator_test.go +++ b/tests/pdctl/operator/operator_test.go @@ -281,3 +281,24 @@ func TestForwardOperatorRequest(t *testing.T) { re.NoError(err) re.Contains(string(output), "null") } + +func TestRequest(t *testing.T) { + re := require.New(t) + for i := 0; i < 100; i++ { + ctx, cancel := context.WithCancel(context.Background()) + cluster, err := tests.NewTestCluster(ctx, 1) + re.NoError(err) + re.NoError(cluster.RunInitialServers()) + re.NotEmpty(cluster.WaitLeader()) + server := cluster.GetLeaderServer() + re.NoError(server.BootstrapCluster()) + backendEndpoints := server.GetAddr() + + cmd := pdctlCmd.GetRootCmd() + args := []string{"-u", backendEndpoints, "operator", "check", "2"} + output, err := pdctl.ExecuteCommand(cmd, args...) + re.NoError(err) + re.Contains(string(output), "operator not found") + cancel() + } +} From 12b1c9dd4798bc055881bfddd460ef99dc9ffd0b Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Mon, 25 Sep 2023 17:58:07 +0800 Subject: [PATCH 14/14] test Signed-off-by: lhy1024 --- tests/pdctl/operator/operator_test.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/pdctl/operator/operator_test.go b/tests/pdctl/operator/operator_test.go index 538dec9cb92..b2d5390a205 100644 --- a/tests/pdctl/operator/operator_test.go +++ b/tests/pdctl/operator/operator_test.go @@ -292,13 +292,6 @@ func TestRequest(t *testing.T) { re.NotEmpty(cluster.WaitLeader()) server := cluster.GetLeaderServer() re.NoError(server.BootstrapCluster()) - backendEndpoints := server.GetAddr() - - cmd := pdctlCmd.GetRootCmd() - args := []string{"-u", backendEndpoints, "operator", "check", "2"} - output, err := pdctl.ExecuteCommand(cmd, args...) - re.NoError(err) - re.Contains(string(output), "operator not found") cancel() } }
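
A minimal sketch of how a test outside the tests package might drive the helpers added to tests/testutil.go above (MustPutStore, MustPutRegion), which replace the old pdctl.MustPutStore/pdctl.MustPutRegion call sites throughout this series. The test name, package name, and the concrete store/region IDs and sizes are illustrative only; the helper signatures and cluster accessors are taken from the diff.

package example_test

import (
	"context"
	"testing"

	"github.com/pingcap/kvproto/pkg/metapb"
	"github.com/stretchr/testify/require"
	"github.com/tikv/pd/pkg/core"
	"github.com/tikv/pd/tests"
)

// TestPutStoreAndRegionSketch is hypothetical and not part of the patch; it only
// shows the intended call pattern for the new helpers.
func TestPutStoreAndRegionSketch(t *testing.T) {
	re := require.New(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	cluster, err := tests.NewTestCluster(ctx, 1)
	re.NoError(err)
	defer cluster.Destroy()
	re.NoError(cluster.RunInitialServers())
	re.NotEmpty(cluster.WaitLeader())
	re.NoError(cluster.GetLeaderServer().BootstrapCluster())

	// MustPutStore fills in a fake address and a minimum supported version if
	// they are empty, registers the store through the leader's gRPC server,
	// and seeds basic store stats (also on the scheduling primary, if one is
	// running).
	tests.MustPutStore(re, cluster, &metapb.Store{
		Id:    1,
		State: metapb.StoreState_Up,
	})

	// MustPutRegion builds a single-peer region and feeds it to the cluster
	// via a region heartbeat.
	region := tests.MustPutRegion(re, cluster, 1000, 1, []byte("a"), []byte("b"),
		core.SetApproximateSize(60))
	re.NotNil(region)
}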
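
Along the same lines, a sketch of exercising the third helper, MustReportBuckets. The wrapper function and the sample numbers are hypothetical, and the BucketStats field names are assumed from kvproto's metapb (one sample per bucket, a single bucket spanning [start, end)); only tests.MustReportBuckets itself comes from the diff.

package example_test

import (
	"github.com/pingcap/kvproto/pkg/metapb"
	"github.com/stretchr/testify/require"
	"github.com/tikv/pd/tests"
)

// reportOneBucket is a hypothetical wrapper: it reports one write-heavy bucket
// covering [start, end) for regionID and returns the metapb.Buckets that
// MustReportBuckets handed to the cluster.
func reportOneBucket(re *require.Assertions, cluster *tests.TestCluster, regionID uint64, start, end []byte) *metapb.Buckets {
	stats := &metapb.BucketStats{
		ReadBytes:  []uint64{0},
		ReadKeys:   []uint64{0},
		ReadQps:    []uint64{0},
		WriteBytes: []uint64{64 * 1024},
		WriteKeys:  []uint64{128},
		WriteQps:   []uint64{10},
	}
	// The region should already be known to the cluster (for example via
	// MustPutRegion, as in the previous sketch) before its buckets are reported.
	return tests.MustReportBuckets(re, cluster, regionID, start, end, stats)
}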