Skip to content

Commit

Permalink
upstream: exclude draining hosts from eds (envoyproxy#30622)
Browse files Browse the repository at this point in the history
* Exclude hosts in DRAINING state

Signed-off-by: Networking team <[email protected]>

* WIP

Signed-off-by: Networking team <[email protected]>

* fix upstream_impl_test

Signed-off-by: Networking team <[email protected]>

* add more tests

Signed-off-by: Networking team <[email protected]>

* fix lint

Signed-off-by: Networking team <[email protected]>

* fix cluster_handler_test

Signed-off-by: Networking team <[email protected]>

* fix lint

Signed-off-by: Networking team <[email protected]>

* rm excluded_via_eds_draining

Signed-off-by: Chao-Han Tsai <[email protected]>

* rename

Signed-off-by: Chao-Han Tsai <[email protected]>

* restore

Signed-off-by: Chao-Han Tsai <[email protected]>

* remove comment

Signed-off-by: Chao-Han Tsai <[email protected]>

* add runtime guard and changelog

Signed-off-by: Networking team <[email protected]>

* fix lint

Signed-off-by: Networking team <[email protected]>

---------

Signed-off-by: Networking team <[email protected]>
Signed-off-by: Chao-Han Tsai <[email protected]>
Co-authored-by: Networking team <[email protected]>
  • Loading branch information
milton0825 and Networking team authored Feb 3, 2024
1 parent 0053634 commit 12e928c
Show file tree
Hide file tree
Showing 10 changed files with 48 additions and 14 deletions.
5 changes: 5 additions & 0 deletions changelogs/current.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ minor_behavior_changes:
- area: adaptive concurrency filter stats
change: |
Multiply the gradient value stat by 1000 to make it more granular (values will range between 500 and 2000).
- area: upstream
change: |
Upstream now excludes hosts set to ``DRAINING`` state via EDS from load balancing and panic routing
threshold calculation. This feature can be disabled by setting
``envoy.reloadable_features.exclude_host_in_eds_status_draining`` to false.
bug_fixes:
# *Changes expected to improve the state of the world and are unlikely to have negative effects*
Expand Down
4 changes: 3 additions & 1 deletion envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ class Host : virtual public HostDescription {
/* is not meant to be routed to. */ \
m(EXCLUDED_VIA_IMMEDIATE_HC_FAIL, 0x80) \
/* The host failed active HC due to timeout. */ \
m(ACTIVE_HC_TIMEOUT, 0x100)
m(ACTIVE_HC_TIMEOUT, 0x100) \
/* The host is currently marked as draining by EDS */ \
m(EDS_STATUS_DRAINING, 0x200)
// clang-format on

#define DECLARE_ENUM(name, value) name = value,
Expand Down
1 change: 1 addition & 0 deletions source/common/runtime/runtime_features.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ RUNTIME_GUARD(envoy_reloadable_features_enable_compression_bomb_protection);
RUNTIME_GUARD(envoy_reloadable_features_enable_connect_udp_support);
RUNTIME_GUARD(envoy_reloadable_features_enable_intermediate_ca);
RUNTIME_GUARD(envoy_reloadable_features_enable_zone_routing_different_zone_counts);
RUNTIME_GUARD(envoy_reloadable_features_exclude_host_in_eds_status_draining);
RUNTIME_GUARD(envoy_reloadable_features_ext_authz_http_send_original_xff);
RUNTIME_GUARD(envoy_reloadable_features_grpc_http1_reverse_bridge_change_http_status);
RUNTIME_GUARD(envoy_reloadable_features_grpc_http1_reverse_bridge_handle_empty_response);
Expand Down
7 changes: 7 additions & 0 deletions source/common/upstream/host_utility.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ void setHealthFlag(Upstream::Host::HealthFlag flag, const Host& host, std::strin
}
break;
}

case Host::HealthFlag::EDS_STATUS_DRAINING: {
if (host.healthFlagGet(Host::HealthFlag::EDS_STATUS_DRAINING)) {
health_status += "/eds_status_draining";
}
break;
}
}
}

Expand Down
15 changes: 12 additions & 3 deletions source/common/upstream/upstream_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -451,16 +451,24 @@ void HostImpl::setEdsHealthFlag(envoy::config::core::v3::HealthStatus health_sta
// Clear all old EDS health flags first.
HostImpl::healthFlagClear(Host::HealthFlag::FAILED_EDS_HEALTH);
HostImpl::healthFlagClear(Host::HealthFlag::DEGRADED_EDS_HEALTH);
if (Runtime::runtimeFeatureEnabled(
"envoy.reloadable_features.exclude_host_in_eds_status_draining")) {
HostImpl::healthFlagClear(Host::HealthFlag::EDS_STATUS_DRAINING);
}

// Set the appropriate EDS health flag.
switch (health_status) {
case envoy::config::core::v3::UNHEALTHY:
FALLTHRU;
case envoy::config::core::v3::DRAINING:
FALLTHRU;
case envoy::config::core::v3::TIMEOUT:
HostImpl::healthFlagSet(Host::HealthFlag::FAILED_EDS_HEALTH);
break;
case envoy::config::core::v3::DRAINING:
if (Runtime::runtimeFeatureEnabled(
"envoy.reloadable_features.exclude_host_in_eds_status_draining")) {
HostImpl::healthFlagSet(Host::HealthFlag::EDS_STATUS_DRAINING);
}
break;
case envoy::config::core::v3::DEGRADED:
HostImpl::healthFlagSet(Host::HealthFlag::DEGRADED_EDS_HEALTH);
break;
Expand Down Expand Up @@ -1575,7 +1583,8 @@ namespace {

bool excludeBasedOnHealthFlag(const Host& host) {
return host.healthFlagGet(Host::HealthFlag::PENDING_ACTIVE_HC) ||
host.healthFlagGet(Host::HealthFlag::EXCLUDED_VIA_IMMEDIATE_HC_FAIL);
host.healthFlagGet(Host::HealthFlag::EXCLUDED_VIA_IMMEDIATE_HC_FAIL) ||
host.healthFlagGet(Host::HealthFlag::EDS_STATUS_DRAINING);
}

} // namespace
Expand Down
3 changes: 2 additions & 1 deletion source/common/upstream/upstream_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,8 @@ class HostImpl : public HostDescriptionImpl,
// If any of the unhealthy flags are set, host is unhealthy.
if (healthFlagsGet(enumToInt(HealthFlag::FAILED_ACTIVE_HC) |
enumToInt(HealthFlag::FAILED_OUTLIER_CHECK) |
enumToInt(HealthFlag::FAILED_EDS_HEALTH))) {
enumToInt(HealthFlag::FAILED_EDS_HEALTH) |
enumToInt(HealthFlag::EDS_STATUS_DRAINING))) {
return Host::Health::Unhealthy;
}

Expand Down
3 changes: 3 additions & 0 deletions source/server/admin/clusters_handler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ void setHealthFlag(Upstream::Host::HealthFlag flag, const Upstream::Host& host,
health_status.set_active_hc_timeout(
host.healthFlagGet(Upstream::Host::HealthFlag::ACTIVE_HC_TIMEOUT));
break;
case Upstream::Host::HealthFlag::EDS_STATUS_DRAINING:
health_status.set_eds_health_status(envoy::config::core::v3::DRAINING);
break;
}
}

Expand Down
2 changes: 1 addition & 1 deletion test/common/upstream/host_utility_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ TEST(HostUtilityTest, All) {
#undef SET_HEALTH_FLAG
EXPECT_EQ("/failed_active_hc/failed_outlier_check/failed_eds_health/degraded_active_hc/"
"degraded_eds_health/pending_dynamic_removal/pending_active_hc/"
"excluded_via_immediate_hc_fail/active_hc_timeout",
"excluded_via_immediate_hc_fail/active_hc_timeout/eds_status_draining",
HostUtility::healthFlagsToString(*host));
}

Expand Down
16 changes: 10 additions & 6 deletions test/common/upstream/upstream_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1783,7 +1783,7 @@ TEST_F(HostImplTest, HealthStatus) {
EXPECT_EQ(Host::HealthStatus::DRAINING, host->edsHealthStatus());
EXPECT_EQ(Host::Health::Unhealthy, host->coarseHealth());
// Old EDS flags should be cleared and new flag will be set.
EXPECT_TRUE(host->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH));
EXPECT_TRUE(host->healthFlagGet(Host::HealthFlag::EDS_STATUS_DRAINING));

// Setting an active unhealthy flag make the host unhealthy.
host->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC);
Expand Down Expand Up @@ -5821,6 +5821,7 @@ TEST(HostPartitionTest, PartitionHosts) {
makeTestHost(info, "tcp://127.0.0.1:81", *time_source, zone_a),
makeTestHost(info, "tcp://127.0.0.1:82", *time_source, zone_b),
makeTestHost(info, "tcp://127.0.0.1:83", *time_source, zone_b),
makeTestHost(info, "tcp://127.0.0.1:84", *time_source, zone_b),
makeTestHost(info, "tcp://127.0.0.1:84", *time_source, zone_b)};

hosts[0]->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC);
Expand All @@ -5829,24 +5830,26 @@ TEST(HostPartitionTest, PartitionHosts) {
hosts[2]->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC);
hosts[4]->healthFlagSet(Host::HealthFlag::EXCLUDED_VIA_IMMEDIATE_HC_FAIL);
hosts[4]->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC);
hosts[5]->healthFlagSet(Host::HealthFlag::EDS_STATUS_DRAINING);

auto hosts_per_locality =
makeHostsPerLocality({{hosts[0], hosts[1]}, {hosts[2], hosts[3], hosts[4]}});
makeHostsPerLocality({{hosts[0], hosts[1]}, {hosts[2], hosts[3], hosts[4], hosts[5]}});

auto update_hosts_params =
HostSetImpl::partitionHosts(std::make_shared<const HostVector>(hosts), hosts_per_locality);

EXPECT_EQ(5, update_hosts_params.hosts->size());
EXPECT_EQ(6, update_hosts_params.hosts->size());
EXPECT_EQ(1, update_hosts_params.healthy_hosts->get().size());
EXPECT_EQ(hosts[3], update_hosts_params.healthy_hosts->get()[0]);
EXPECT_EQ(1, update_hosts_params.degraded_hosts->get().size());
EXPECT_EQ(hosts[1], update_hosts_params.degraded_hosts->get()[0]);
EXPECT_EQ(2, update_hosts_params.excluded_hosts->get().size());
EXPECT_EQ(3, update_hosts_params.excluded_hosts->get().size());
EXPECT_EQ(hosts[2], update_hosts_params.excluded_hosts->get()[0]);
EXPECT_EQ(hosts[4], update_hosts_params.excluded_hosts->get()[1]);
EXPECT_EQ(hosts[5], update_hosts_params.excluded_hosts->get()[2]);

EXPECT_EQ(2, update_hosts_params.hosts_per_locality->get()[0].size());
EXPECT_EQ(3, update_hosts_params.hosts_per_locality->get()[1].size());
EXPECT_EQ(4, update_hosts_params.hosts_per_locality->get()[1].size());

EXPECT_EQ(0, update_hosts_params.healthy_hosts_per_locality->get()[0].size());
EXPECT_EQ(1, update_hosts_params.healthy_hosts_per_locality->get()[1].size());
Expand All @@ -5857,9 +5860,10 @@ TEST(HostPartitionTest, PartitionHosts) {
EXPECT_EQ(hosts[1], update_hosts_params.degraded_hosts_per_locality->get()[0][0]);

EXPECT_EQ(0, update_hosts_params.excluded_hosts_per_locality->get()[0].size());
EXPECT_EQ(2, update_hosts_params.excluded_hosts_per_locality->get()[1].size());
EXPECT_EQ(3, update_hosts_params.excluded_hosts_per_locality->get()[1].size());
EXPECT_EQ(hosts[2], update_hosts_params.excluded_hosts_per_locality->get()[1][0]);
EXPECT_EQ(hosts[4], update_hosts_params.excluded_hosts_per_locality->get()[1][1]);
EXPECT_EQ(hosts[5], update_hosts_params.excluded_hosts_per_locality->get()[1][2]);
}

TEST_F(ClusterInfoImplTest, MaxRequestsPerConnectionValidation) {
Expand Down
6 changes: 4 additions & 2 deletions test/server/admin/clusters_handler_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ TEST_P(AdminInstanceTest, ClustersJsonAndText) {
.WillByDefault(Return(true));
ON_CALL(*host, healthFlagGet(Upstream::Host::HealthFlag::FAILED_EDS_HEALTH))
.WillByDefault(Return(false));
ON_CALL(*host, healthFlagGet(Upstream::Host::HealthFlag::EDS_STATUS_DRAINING))
.WillByDefault(Return(true));
ON_CALL(*host, healthFlagGet(Upstream::Host::HealthFlag::DEGRADED_ACTIVE_HC))
.WillByDefault(Return(true));
ON_CALL(*host, healthFlagGet(Upstream::Host::HealthFlag::DEGRADED_EDS_HEALTH))
Expand Down Expand Up @@ -171,7 +173,7 @@ TEST_P(AdminInstanceTest, ClustersJsonAndText) {
}
],
"health_status": {
"eds_health_status": "DEGRADED",
"eds_health_status": "DRAINING",
"failed_active_health_check": true,
"failed_outlier_check": true,
"failed_active_degraded_check": true,
Expand Down Expand Up @@ -228,7 +230,7 @@ fake_cluster::1.2.3.4:80::rest_counter::10
fake_cluster::1.2.3.4:80::test_counter::10
fake_cluster::1.2.3.4:80::test_gauge::11
fake_cluster::1.2.3.4:80::hostname::foo.com
fake_cluster::1.2.3.4:80::health_flags::/failed_active_hc/failed_outlier_check/degraded_active_hc/degraded_eds_health/pending_dynamic_removal
fake_cluster::1.2.3.4:80::health_flags::/failed_active_hc/failed_outlier_check/degraded_active_hc/degraded_eds_health/pending_dynamic_removal/eds_status_draining
fake_cluster::1.2.3.4:80::weight::5
fake_cluster::1.2.3.4:80::region::test_region
fake_cluster::1.2.3.4:80::zone::test_zone
Expand Down

0 comments on commit 12e928c

Please sign in to comment.