diff --git a/changelogs/current.yaml b/changelogs/current.yaml
index e8515c3cf114..9500976feca7 100644
--- a/changelogs/current.yaml
+++ b/changelogs/current.yaml
@@ -8,6 +8,11 @@ minor_behavior_changes:
 - area: adaptive concurrency filter stats
   change: |
     Multiply the gradient value stat by 1000 to make it more granular (values will range between 500 and 2000).
+- area: upstream
+  change: |
+    Upstream now excludes hosts set to ``DRAINING`` state via EDS from load balancing and panic routing
+    threshold calculation. This feature can be disabled by setting
+    ``envoy.reloadable_features.exclude_host_in_eds_status_draining`` to false.
 
 bug_fixes:
 # *Changes expected to improve the state of the world and are unlikely to have negative effects*
diff --git a/envoy/upstream/upstream.h b/envoy/upstream/upstream.h
index c5cf57e586ee..2bb5739e13b2 100644
--- a/envoy/upstream/upstream.h
+++ b/envoy/upstream/upstream.h
@@ -149,7 +149,9 @@ class Host : virtual public HostDescription {
   /* is not meant to be routed to. */ \
   m(EXCLUDED_VIA_IMMEDIATE_HC_FAIL, 0x80) \
   /* The host failed active HC due to timeout. */ \
-  m(ACTIVE_HC_TIMEOUT, 0x100)
+  m(ACTIVE_HC_TIMEOUT, 0x100) \
+  /* The host is currently marked as draining by EDS */ \
+  m(EDS_STATUS_DRAINING, 0x200)
 // clang-format on
 
 #define DECLARE_ENUM(name, value) name = value,
diff --git a/source/common/runtime/runtime_features.cc b/source/common/runtime/runtime_features.cc
index cbce6738f1fe..56cb066b684a 100644
--- a/source/common/runtime/runtime_features.cc
+++ b/source/common/runtime/runtime_features.cc
@@ -45,6 +45,7 @@ RUNTIME_GUARD(envoy_reloadable_features_enable_compression_bomb_protection);
 RUNTIME_GUARD(envoy_reloadable_features_enable_connect_udp_support);
 RUNTIME_GUARD(envoy_reloadable_features_enable_intermediate_ca);
 RUNTIME_GUARD(envoy_reloadable_features_enable_zone_routing_different_zone_counts);
+RUNTIME_GUARD(envoy_reloadable_features_exclude_host_in_eds_status_draining);
 RUNTIME_GUARD(envoy_reloadable_features_ext_authz_http_send_original_xff);
 RUNTIME_GUARD(envoy_reloadable_features_grpc_http1_reverse_bridge_change_http_status);
 RUNTIME_GUARD(envoy_reloadable_features_grpc_http1_reverse_bridge_handle_empty_response);
diff --git a/source/common/upstream/host_utility.cc b/source/common/upstream/host_utility.cc
index 341b0674d815..e0791aa04252 100644
--- a/source/common/upstream/host_utility.cc
+++ b/source/common/upstream/host_utility.cc
@@ -73,6 +73,13 @@ void setHealthFlag(Upstream::Host::HealthFlag flag, const Host& host, std::strin
     }
     break;
   }
+
+  case Host::HealthFlag::EDS_STATUS_DRAINING: {
+    if (host.healthFlagGet(Host::HealthFlag::EDS_STATUS_DRAINING)) {
+      health_status += "/eds_status_draining";
+    }
+    break;
+  }
   }
 }
 
diff --git a/source/common/upstream/upstream_impl.cc b/source/common/upstream/upstream_impl.cc
index b3e3fd94f86a..fa5db9c199ca 100644
--- a/source/common/upstream/upstream_impl.cc
+++ b/source/common/upstream/upstream_impl.cc
@@ -451,16 +451,24 @@ void HostImpl::setEdsHealthFlag(envoy::config::core::v3::HealthStatus health_sta
   // Clear all old EDS health flags first.
   HostImpl::healthFlagClear(Host::HealthFlag::FAILED_EDS_HEALTH);
   HostImpl::healthFlagClear(Host::HealthFlag::DEGRADED_EDS_HEALTH);
+  if (Runtime::runtimeFeatureEnabled(
+          "envoy.reloadable_features.exclude_host_in_eds_status_draining")) {
+    HostImpl::healthFlagClear(Host::HealthFlag::EDS_STATUS_DRAINING);
+  }
 
   // Set the appropriate EDS health flag.
   switch (health_status) {
   case envoy::config::core::v3::UNHEALTHY:
     FALLTHRU;
-  case envoy::config::core::v3::DRAINING:
-    FALLTHRU;
   case envoy::config::core::v3::TIMEOUT:
     HostImpl::healthFlagSet(Host::HealthFlag::FAILED_EDS_HEALTH);
     break;
+  case envoy::config::core::v3::DRAINING:
+    if (Runtime::runtimeFeatureEnabled(
+            "envoy.reloadable_features.exclude_host_in_eds_status_draining")) {
+      HostImpl::healthFlagSet(Host::HealthFlag::EDS_STATUS_DRAINING);
+    }
+    break;
   case envoy::config::core::v3::DEGRADED:
     HostImpl::healthFlagSet(Host::HealthFlag::DEGRADED_EDS_HEALTH);
     break;
@@ -1575,7 +1583,8 @@ namespace {
 
 bool excludeBasedOnHealthFlag(const Host& host) {
   return host.healthFlagGet(Host::HealthFlag::PENDING_ACTIVE_HC) ||
-         host.healthFlagGet(Host::HealthFlag::EXCLUDED_VIA_IMMEDIATE_HC_FAIL);
+         host.healthFlagGet(Host::HealthFlag::EXCLUDED_VIA_IMMEDIATE_HC_FAIL) ||
+         host.healthFlagGet(Host::HealthFlag::EDS_STATUS_DRAINING);
 }
 
 } // namespace
diff --git a/source/common/upstream/upstream_impl.h b/source/common/upstream/upstream_impl.h
index 7de595e683a8..c17a70ea1a0d 100644
--- a/source/common/upstream/upstream_impl.h
+++ b/source/common/upstream/upstream_impl.h
@@ -409,7 +409,8 @@ class HostImpl : public HostDescriptionImpl,
     // If any of the unhealthy flags are set, host is unhealthy.
     if (healthFlagsGet(enumToInt(HealthFlag::FAILED_ACTIVE_HC) |
                        enumToInt(HealthFlag::FAILED_OUTLIER_CHECK) |
-                       enumToInt(HealthFlag::FAILED_EDS_HEALTH))) {
+                       enumToInt(HealthFlag::FAILED_EDS_HEALTH) |
+                       enumToInt(HealthFlag::EDS_STATUS_DRAINING))) {
       return Host::Health::Unhealthy;
     }
 
diff --git a/source/server/admin/clusters_handler.cc b/source/server/admin/clusters_handler.cc
index d1ca2c2b285e..5b05de26a27c 100644
--- a/source/server/admin/clusters_handler.cc
+++ b/source/server/admin/clusters_handler.cc
@@ -99,6 +99,9 @@ void setHealthFlag(Upstream::Host::HealthFlag flag, const Upstream::Host& host,
     health_status.set_active_hc_timeout(
         host.healthFlagGet(Upstream::Host::HealthFlag::ACTIVE_HC_TIMEOUT));
     break;
+  case Upstream::Host::HealthFlag::EDS_STATUS_DRAINING:
+    health_status.set_eds_health_status(envoy::config::core::v3::DRAINING);
+    break;
   }
 }
 
diff --git a/test/common/upstream/host_utility_test.cc b/test/common/upstream/host_utility_test.cc
index e0df8d442fd6..af6580e41f38 100644
--- a/test/common/upstream/host_utility_test.cc
+++ b/test/common/upstream/host_utility_test.cc
@@ -61,7 +61,7 @@ TEST(HostUtilityTest, All) {
 #undef SET_HEALTH_FLAG
 
   EXPECT_EQ("/failed_active_hc/failed_outlier_check/failed_eds_health/degraded_active_hc/"
             "degraded_eds_health/pending_dynamic_removal/pending_active_hc/"
-            "excluded_via_immediate_hc_fail/active_hc_timeout",
+            "excluded_via_immediate_hc_fail/active_hc_timeout/eds_status_draining",
             HostUtility::healthFlagsToString(*host));
 }
diff --git a/test/common/upstream/upstream_impl_test.cc b/test/common/upstream/upstream_impl_test.cc
index ad7a00d3b824..aa388bb83dd9 100644
--- a/test/common/upstream/upstream_impl_test.cc
+++ b/test/common/upstream/upstream_impl_test.cc
@@ -1783,7 +1783,7 @@ TEST_F(HostImplTest, HealthStatus) {
   EXPECT_EQ(Host::HealthStatus::DRAINING, host->edsHealthStatus());
   EXPECT_EQ(Host::Health::Unhealthy, host->coarseHealth());
   // Old EDS flags should be cleared and new flag will be set.
-  EXPECT_TRUE(host->healthFlagGet(Host::HealthFlag::FAILED_EDS_HEALTH));
+  EXPECT_TRUE(host->healthFlagGet(Host::HealthFlag::EDS_STATUS_DRAINING));
 
   // Setting an active unhealthy flag make the host unhealthy.
   host->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC);
@@ -5821,6 +5821,7 @@ TEST(HostPartitionTest, PartitionHosts) {
       makeTestHost(info, "tcp://127.0.0.1:81", *time_source, zone_a),
       makeTestHost(info, "tcp://127.0.0.1:82", *time_source, zone_b),
       makeTestHost(info, "tcp://127.0.0.1:83", *time_source, zone_b),
+      makeTestHost(info, "tcp://127.0.0.1:84", *time_source, zone_b),
       makeTestHost(info, "tcp://127.0.0.1:84", *time_source, zone_b)};
 
   hosts[0]->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC);
@@ -5829,24 +5830,26 @@ TEST(HostPartitionTest, PartitionHosts) {
   hosts[2]->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC);
   hosts[4]->healthFlagSet(Host::HealthFlag::EXCLUDED_VIA_IMMEDIATE_HC_FAIL);
   hosts[4]->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC);
+  hosts[5]->healthFlagSet(Host::HealthFlag::EDS_STATUS_DRAINING);
 
   auto hosts_per_locality =
-      makeHostsPerLocality({{hosts[0], hosts[1]}, {hosts[2], hosts[3], hosts[4]}});
+      makeHostsPerLocality({{hosts[0], hosts[1]}, {hosts[2], hosts[3], hosts[4], hosts[5]}});
 
   auto update_hosts_params =
       HostSetImpl::partitionHosts(std::make_shared<const HostVector>(hosts), hosts_per_locality);
 
-  EXPECT_EQ(5, update_hosts_params.hosts->size());
+  EXPECT_EQ(6, update_hosts_params.hosts->size());
   EXPECT_EQ(1, update_hosts_params.healthy_hosts->get().size());
   EXPECT_EQ(hosts[3], update_hosts_params.healthy_hosts->get()[0]);
   EXPECT_EQ(1, update_hosts_params.degraded_hosts->get().size());
   EXPECT_EQ(hosts[1], update_hosts_params.degraded_hosts->get()[0]);
-  EXPECT_EQ(2, update_hosts_params.excluded_hosts->get().size());
+  EXPECT_EQ(3, update_hosts_params.excluded_hosts->get().size());
   EXPECT_EQ(hosts[2], update_hosts_params.excluded_hosts->get()[0]);
   EXPECT_EQ(hosts[4], update_hosts_params.excluded_hosts->get()[1]);
+  EXPECT_EQ(hosts[5], update_hosts_params.excluded_hosts->get()[2]);
 
   EXPECT_EQ(2, update_hosts_params.hosts_per_locality->get()[0].size());
-  EXPECT_EQ(3, update_hosts_params.hosts_per_locality->get()[1].size());
+  EXPECT_EQ(4, update_hosts_params.hosts_per_locality->get()[1].size());
 
   EXPECT_EQ(0, update_hosts_params.healthy_hosts_per_locality->get()[0].size());
   EXPECT_EQ(1, update_hosts_params.healthy_hosts_per_locality->get()[1].size());
@@ -5857,9 +5860,10 @@ TEST(HostPartitionTest, PartitionHosts) {
   EXPECT_EQ(hosts[1], update_hosts_params.degraded_hosts_per_locality->get()[0][0]);
 
   EXPECT_EQ(0, update_hosts_params.excluded_hosts_per_locality->get()[0].size());
-  EXPECT_EQ(2, update_hosts_params.excluded_hosts_per_locality->get()[1].size());
+  EXPECT_EQ(3, update_hosts_params.excluded_hosts_per_locality->get()[1].size());
   EXPECT_EQ(hosts[2], update_hosts_params.excluded_hosts_per_locality->get()[1][0]);
   EXPECT_EQ(hosts[4], update_hosts_params.excluded_hosts_per_locality->get()[1][1]);
+  EXPECT_EQ(hosts[5], update_hosts_params.excluded_hosts_per_locality->get()[1][2]);
 }
 
 TEST_F(ClusterInfoImplTest, MaxRequestsPerConnectionValidation) {
diff --git a/test/server/admin/clusters_handler_test.cc b/test/server/admin/clusters_handler_test.cc
index ac88226f454e..389e713d1e19 100644
--- a/test/server/admin/clusters_handler_test.cc
+++ b/test/server/admin/clusters_handler_test.cc
@@ -80,6 +80,8 @@ TEST_P(AdminInstanceTest, ClustersJsonAndText) {
       .WillByDefault(Return(true));
   ON_CALL(*host, healthFlagGet(Upstream::Host::HealthFlag::FAILED_EDS_HEALTH))
       .WillByDefault(Return(false));
+  ON_CALL(*host, healthFlagGet(Upstream::Host::HealthFlag::EDS_STATUS_DRAINING))
+      .WillByDefault(Return(true));
   ON_CALL(*host, healthFlagGet(Upstream::Host::HealthFlag::DEGRADED_ACTIVE_HC))
       .WillByDefault(Return(true));
   ON_CALL(*host, healthFlagGet(Upstream::Host::HealthFlag::DEGRADED_EDS_HEALTH))
@@ -171,7 +173,7 @@ TEST_P(AdminInstanceTest, ClustersJsonAndText) {
      }
     ],
     "health_status": {
-     "eds_health_status": "DEGRADED",
+     "eds_health_status": "DRAINING",
      "failed_active_health_check": true,
      "failed_outlier_check": true,
      "failed_active_degraded_check": true,
@@ -228,7 +230,7 @@ fake_cluster::1.2.3.4:80::rest_counter::10
 fake_cluster::1.2.3.4:80::test_counter::10
 fake_cluster::1.2.3.4:80::test_gauge::11
 fake_cluster::1.2.3.4:80::hostname::foo.com
-fake_cluster::1.2.3.4:80::health_flags::/failed_active_hc/failed_outlier_check/degraded_active_hc/degraded_eds_health/pending_dynamic_removal
+fake_cluster::1.2.3.4:80::health_flags::/failed_active_hc/failed_outlier_check/degraded_active_hc/degraded_eds_health/pending_dynamic_removal/eds_status_draining
 fake_cluster::1.2.3.4:80::weight::5
 fake_cluster::1.2.3.4:80::region::test_region
 fake_cluster::1.2.3.4:80::zone::test_zone
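Reviewer note: the changelog entry above says the new exclusion can be turned off through the
envoy.reloadable_features.exclude_host_in_eds_status_draining runtime guard. A minimal bootstrap
sketch for exercising that rollback path locally could look like the snippet below; only the
runtime key comes from this patch, the layer name is illustrative.

layered_runtime:
  layers:
  - name: disable_eds_draining_exclusion  # illustrative layer name
    static_layer:
      # Pin the new guard to false so hosts reported as DRAINING via EDS are no
      # longer excluded from load balancing and panic-threshold calculations.
      envoy.reloadable_features.exclude_host_in_eds_status_draining: false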