From f8b2a81581fc9306442be889d21123e9e68fb659 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Fri, 30 Aug 2024 13:04:50 -0700 Subject: [PATCH] Introduce allocation filter to control placement of search only replicas Signed-off-by: Marc Handalian --- CHANGELOG.md | 1 + .../SearchReplicaFilteringAllocationIT.java | 125 ++++++++++++++++++ .../SearchOnlyReplicaFeatureFlagIT.java | 12 ++ .../decider/FilterAllocationDecider.java | 53 +++++++- .../common/settings/ClusterSettings.java | 4 +- .../decider/FilterAllocationDeciderTests.java | 99 ++++++++++++++ 6 files changed, 292 insertions(+), 2 deletions(-) create mode 100644 server/src/internalClusterTest/java/org/opensearch/cluster/allocation/SearchReplicaFilteringAllocationIT.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 9381aeee9865f..d75a480c18e07 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Adding translog durability validation in index templates ([#15494](https://github.com/opensearch-project/OpenSearch/pull/15494)) - Add index creation using the context field ([#15290](https://github.com/opensearch-project/OpenSearch/pull/15290)) - [Reader Writer Separation] Add searchOnly replica routing configuration ([#15410](https://github.com/opensearch-project/OpenSearch/pull/15410)) +- [Reader Writer Separation] Add allocation filter for search replicas ([#15455](https://github.com/opensearch-project/OpenSearch/pull/15455)) ### Dependencies - Bump `netty` from 4.1.111.Final to 4.1.112.Final ([#15081](https://github.com/opensearch-project/OpenSearch/pull/15081)) diff --git a/server/src/internalClusterTest/java/org/opensearch/cluster/allocation/SearchReplicaFilteringAllocationIT.java b/server/src/internalClusterTest/java/org/opensearch/cluster/allocation/SearchReplicaFilteringAllocationIT.java new file mode 100644 index 0000000000000..1f4694c6815c1 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/cluster/allocation/SearchReplicaFilteringAllocationIT.java @@ -0,0 +1,125 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.cluster.allocation; + +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.cluster.routing.IndexShardRoutingTable; +import org.opensearch.cluster.routing.ShardRouting; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.util.FeatureFlags; +import org.opensearch.indices.replication.common.ReplicationType; +import org.opensearch.test.OpenSearchIntegTestCase; + +import java.util.List; +import java.util.stream.Collectors; + +import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_REPLICATION_TYPE; +import static org.opensearch.cluster.routing.allocation.decider.FilterAllocationDecider.SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING; + +@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0) +public class SearchReplicaFilteringAllocationIT extends OpenSearchIntegTestCase { + + @Override + protected Settings featureFlagSettings() { + return Settings.builder().put(super.featureFlagSettings()).put(FeatureFlags.READER_WRITER_SPLIT_EXPERIMENTAL, Boolean.TRUE).build(); + } + + public void testSearchReplicaDedicatedIncludes() { + List nodesIds = internalCluster().startNodes(3); + final String node_0 = nodesIds.get(0); + final String node_1 = nodesIds.get(1); + final String node_2 = nodesIds.get(2); + assertEquals(3, cluster().size()); + + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings( + Settings.builder().put(SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING.getKey() + "_name", node_1 + "," + node_0) + ) + .execute() + .actionGet(); + + createIndex( + "test", + Settings.builder() + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + .put(IndexMetadata.SETTING_NUMBER_OF_SEARCH_REPLICAS, 1) + .put(SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT) + .build() + ); + ensureGreen("test"); + // ensure primary is not on 1 or 2, + IndexShardRoutingTable routingTable = getRoutingTable(); + assertEquals(node_2, getNodeName(routingTable.primaryShard().currentNodeId())); + + String existingSearchReplicaNode = getNodeName(routingTable.searchOnlyReplicas().get(0).currentNodeId()); + String emptyAllowedNode = existingSearchReplicaNode.equals(node_0) ? node_1 : node_0; + + // set the included nodes to the other open node. + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings(Settings.builder().put(SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING.getKey() + "_name", emptyAllowedNode)) + .execute() + .actionGet(); + ensureGreen("test"); + + routingTable = getRoutingTable(); + assertEquals(node_2, getNodeName(routingTable.primaryShard().currentNodeId())); + assertEquals(emptyAllowedNode, getNodeName(routingTable.searchOnlyReplicas().get(0).currentNodeId())); + } + + public void testSearchReplicaDedicatedIncludes_DoNotAssignToOtherNodes() { + List nodesIds = internalCluster().startNodes(3); + final String node_0 = nodesIds.get(0); + final String node_1 = nodesIds.get(1); + final String node_2 = nodesIds.get(2); + assertEquals(3, cluster().size()); + + // set filter on 1 node and set search replica count to 2 - should leave 1 unassigned + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings(Settings.builder().put(SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING.getKey() + "_name", node_1)) + .execute() + .actionGet(); + + logger.info("--> creating an index with no replicas"); + createIndex( + "test", + Settings.builder() + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + .put(IndexMetadata.SETTING_NUMBER_OF_SEARCH_REPLICAS, 2) + .put(SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT) + .build() + ); + ensureYellowAndNoInitializingShards("test"); + IndexShardRoutingTable routingTable = getRoutingTable(); + assertEquals(2, routingTable.searchOnlyReplicas().size()); + List assignedSearchShards = routingTable.searchOnlyReplicas() + .stream() + .filter(ShardRouting::assignedToNode) + .collect(Collectors.toList()); + assertEquals(1, assignedSearchShards.size()); + assertEquals(node_1, getNodeName(assignedSearchShards.get(0).currentNodeId())); + assertEquals(1, routingTable.searchOnlyReplicas().stream().filter(ShardRouting::unassigned).count()); + } + + private IndexShardRoutingTable getRoutingTable() { + IndexShardRoutingTable routingTable = getClusterState().routingTable().index("test").getShards().get(0); + return routingTable; + } + + private String getNodeName(String id) { + return getClusterState().nodes().get(id).getName(); + } +} diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/settings/SearchOnlyReplicaFeatureFlagIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/settings/SearchOnlyReplicaFeatureFlagIT.java index e5a05c04fa7ee..f3fc569f1a3fd 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/settings/SearchOnlyReplicaFeatureFlagIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/settings/SearchOnlyReplicaFeatureFlagIT.java @@ -17,6 +17,7 @@ import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SEARCH_REPLICAS; import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_REPLICATION_TYPE; +import static org.opensearch.cluster.routing.allocation.decider.FilterAllocationDecider.SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING; @OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.SUITE, numDataNodes = 1) public class SearchOnlyReplicaFeatureFlagIT extends OpenSearchIntegTestCase { @@ -53,4 +54,15 @@ public void testUpdateFeatureFlagDisabled() { }); assertTrue(settingsException.getMessage().contains("unknown setting")); } + + public void testFilterAllocationSettingNotRegistered() { + expectThrows(SettingsException.class, () -> { + client().admin() + .cluster() + .prepareUpdateSettings() + .setTransientSettings(Settings.builder().put(SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING.getKey() + "_name", "node")) + .execute() + .actionGet(); + }); + } } diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDecider.java index d3200c1bc9d75..a48ba889914b7 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDecider.java @@ -44,6 +44,7 @@ import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Setting.Property; import org.opensearch.common.settings.Settings; +import org.opensearch.common.util.FeatureFlags; import org.opensearch.node.remotestore.RemoteStoreNodeService; import java.util.Map; @@ -88,6 +89,8 @@ public class FilterAllocationDecider extends AllocationDecider { private static final String CLUSTER_ROUTING_REQUIRE_GROUP_PREFIX = "cluster.routing.allocation.require"; private static final String CLUSTER_ROUTING_INCLUDE_GROUP_PREFIX = "cluster.routing.allocation.include"; private static final String CLUSTER_ROUTING_EXCLUDE_GROUP_PREFIX = "cluster.routing.allocation.exclude"; + private static final String SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_PREFIX = "cluster.routing.allocation.search.replica.dedicated.include"; + public static final Setting.AffixSetting CLUSTER_ROUTING_REQUIRE_GROUP_SETTING = Setting.prefixKeySetting( CLUSTER_ROUTING_REQUIRE_GROUP_PREFIX + ".", key -> Setting.simpleString(key, value -> IP_VALIDATOR.accept(key, value), Property.Dynamic, Property.NodeScope) @@ -100,7 +103,12 @@ public class FilterAllocationDecider extends AllocationDecider { CLUSTER_ROUTING_EXCLUDE_GROUP_PREFIX + ".", key -> Setting.simpleString(key, value -> IP_VALIDATOR.accept(key, value), Property.Dynamic, Property.NodeScope) ); + public static final Setting.AffixSetting SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING = Setting.prefixKeySetting( + SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_PREFIX + ".", + key -> Setting.simpleString(key, value -> IP_VALIDATOR.accept(key, value), Property.Dynamic, Property.NodeScope) + ); + private volatile DiscoveryNodeFilters searchReplicaIncludeFilters; private volatile DiscoveryNodeFilters clusterRequireFilters; private volatile DiscoveryNodeFilters clusterIncludeFilters; private volatile DiscoveryNodeFilters clusterExcludeFilters; @@ -113,7 +121,6 @@ public FilterAllocationDecider(Settings settings, ClusterSettings clusterSetting setClusterIncludeFilters(CLUSTER_ROUTING_INCLUDE_GROUP_SETTING.getAsMap(settings)); this.migrationDirection = RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING.get(settings); this.compatibilityMode = RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING.get(settings); - clusterSettings.addAffixMapUpdateConsumer(CLUSTER_ROUTING_REQUIRE_GROUP_SETTING, this::setClusterRequireFilters, (a, b) -> {}); clusterSettings.addAffixMapUpdateConsumer(CLUSTER_ROUTING_EXCLUDE_GROUP_SETTING, this::setClusterExcludeFilters, (a, b) -> {}); clusterSettings.addAffixMapUpdateConsumer(CLUSTER_ROUTING_INCLUDE_GROUP_SETTING, this::setClusterIncludeFilters, (a, b) -> {}); @@ -122,6 +129,15 @@ public FilterAllocationDecider(Settings settings, ClusterSettings clusterSetting RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING, this::setCompatibilityMode ); + + if (FeatureFlags.isEnabled(FeatureFlags.READER_WRITER_SPLIT_EXPERIMENTAL)) { + setSearchReplicaIncludeFilters(SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING.getAsMap(settings)); + clusterSettings.addAffixMapUpdateConsumer( + SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING, + this::setSearchReplicaIncludeFilters, + (a, b) -> {} + ); + } } private void setMigrationDirection(RemoteStoreNodeService.Direction migrationDirection) { @@ -203,6 +219,9 @@ private Decision shouldFilter(ShardRouting shardRouting, DiscoveryNode node, Rou decision = shouldIndexFilter(allocation.metadata().getIndexSafe(shardRouting.index()), node, allocation); if (decision != null) return decision; + decision = shouldSearchReplicaShardTypeFilter(shardRouting, node, allocation); + if (decision != null) return decision; + return allocation.decision(Decision.YES, NAME, "node passes include/exclude/require filters"); } @@ -294,6 +313,32 @@ private Decision shouldClusterFilter(DiscoveryNode node, RoutingAllocation alloc return null; } + private Decision shouldSearchReplicaShardTypeFilter(ShardRouting routing, DiscoveryNode node, RoutingAllocation allocation) { + if (searchReplicaIncludeFilters != null) { + final boolean match = searchReplicaIncludeFilters.match(node); + if (match == false && routing.isSearchOnly()) { + return allocation.decision( + Decision.NO, + NAME, + "node does not match shard setting [%s] filters [%s]", + SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_PREFIX, + searchReplicaIncludeFilters + ); + } + // filter will only apply to search replicas + if (routing.isSearchOnly() == false && match) { + return allocation.decision( + Decision.NO, + NAME, + "only search replicas can be allocated to node with setting [%s] filters [%s]", + SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_PREFIX, + searchReplicaIncludeFilters + ); + } + } + return null; + } + private void setClusterRequireFilters(Map filters) { clusterRequireFilters = DiscoveryNodeFilters.trimTier( DiscoveryNodeFilters.buildOrUpdateFromKeyValue(clusterRequireFilters, AND, filters) @@ -311,4 +356,10 @@ private void setClusterExcludeFilters(Map filters) { DiscoveryNodeFilters.buildOrUpdateFromKeyValue(clusterExcludeFilters, OR, filters) ); } + + private void setSearchReplicaIncludeFilters(Map filters) { + searchReplicaIncludeFilters = DiscoveryNodeFilters.trimTier( + DiscoveryNodeFilters.buildOrUpdateFromKeyValue(searchReplicaIncludeFilters, OR, filters) + ); + } } diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index 9a6b3f1118709..fb796b99c6e46 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -805,6 +805,8 @@ public void apply(Settings value, Settings current, Settings previous) { OpenSearchOnHeapCacheSettings.EXPIRE_AFTER_ACCESS_SETTING.getConcreteSettingForNamespace( CacheType.INDICES_REQUEST_CACHE.getSettingPrefix() ) - ) + ), + List.of(FeatureFlags.READER_WRITER_SPLIT_EXPERIMENTAL), + List.of(FilterAllocationDecider.SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING) ); } diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java index 2e303887e0f1b..af5a58ab04354 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java @@ -53,16 +53,21 @@ import org.opensearch.common.settings.Settings; import org.opensearch.node.remotestore.RemoteStoreNodeService; import org.opensearch.snapshots.EmptySnapshotsInfoService; +import org.opensearch.test.FeatureFlagSetter; import org.opensearch.test.gateway.TestGatewayAllocator; import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; +import java.util.Set; import static org.opensearch.cluster.metadata.IndexMetadata.INDEX_RESIZE_SOURCE_NAME; import static org.opensearch.cluster.metadata.IndexMetadata.INDEX_RESIZE_SOURCE_UUID; import static org.opensearch.cluster.routing.ShardRoutingState.INITIALIZING; import static org.opensearch.cluster.routing.ShardRoutingState.STARTED; import static org.opensearch.cluster.routing.ShardRoutingState.UNASSIGNED; +import static org.opensearch.cluster.routing.allocation.decider.FilterAllocationDecider.SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING; +import static org.opensearch.common.util.FeatureFlags.READER_WRITER_SPLIT_EXPERIMENTAL; import static org.opensearch.node.remotestore.RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING; import static org.opensearch.node.remotestore.RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING; @@ -462,4 +467,98 @@ public void testMixedModeRemoteStoreAllocation() { decision = (Decision.Single) filterAllocationDecider.canAllocate(sr, state.getRoutingNodes().node("node2"), allocation); assertEquals(decision.toString(), Type.NO, decision.type()); } + + public void testSearchReplicaRoutingDedicatedIncludes() { + FeatureFlagSetter.set(READER_WRITER_SPLIT_EXPERIMENTAL); + // we aren't using a settingsModule here so we need to set feature flag gated setting + Set> settings = new HashSet<>(ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + settings.add(SEARCH_REPLICA_ROUTING_INCLUDE_GROUP_SETTING); + ClusterSettings clusterSettings = new ClusterSettings(Settings.builder().build(), settings); + Settings initialSettings = Settings.builder() + .put("cluster.routing.allocation.search.replica.dedicated.include._id", "node1,node2") + .build(); + + FilterAllocationDecider filterAllocationDecider = new FilterAllocationDecider(initialSettings, clusterSettings); + AllocationDeciders allocationDeciders = new AllocationDeciders( + Arrays.asList( + filterAllocationDecider, + new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider() + ) + ); + AllocationService service = new AllocationService( + allocationDeciders, + new TestGatewayAllocator(), + new BalancedShardsAllocator(Settings.EMPTY), + EmptyClusterInfoService.INSTANCE, + EmptySnapshotsInfoService.INSTANCE + ); + ClusterState state = createInitialClusterState(service, Settings.EMPTY, Settings.EMPTY); + RoutingTable routingTable = state.routingTable(); + RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state.getRoutingNodes(), state, null, null, 0); + allocation.debugDecision(true); + + ShardRouting searchReplica = ShardRouting.newUnassigned( + routingTable.index("sourceIndex").shard(0).shardId(), + false, + true, + RecoverySource.PeerRecoverySource.INSTANCE, + new UnassignedInfo(UnassignedInfo.Reason.NODE_LEFT, "") + ); + + ShardRouting regularReplica = ShardRouting.newUnassigned( + routingTable.index("sourceIndex").shard(0).shardId(), + false, + false, + RecoverySource.PeerRecoverySource.INSTANCE, + new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "") + ); + + ShardRouting primary = ShardRouting.newUnassigned( + routingTable.index("sourceIndex").shard(0).shardId(), + true, + false, + RecoverySource.PeerRecoverySource.INSTANCE, + new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "") + ); + + Decision.Single decision = (Decision.Single) filterAllocationDecider.canAllocate( + searchReplica, + state.getRoutingNodes().node("node2"), + allocation + ); + assertEquals(decision.toString(), Type.YES, decision.type()); + decision = (Decision.Single) filterAllocationDecider.canAllocate(searchReplica, state.getRoutingNodes().node("node1"), allocation); + assertEquals(decision.toString(), Type.YES, decision.type()); + + decision = (Decision.Single) filterAllocationDecider.canAllocate(regularReplica, state.getRoutingNodes().node("node2"), allocation); + assertEquals(decision.toString(), Type.NO, decision.type()); + decision = (Decision.Single) filterAllocationDecider.canAllocate(regularReplica, state.getRoutingNodes().node("node1"), allocation); + assertEquals(decision.toString(), Type.NO, decision.type()); + + decision = (Decision.Single) filterAllocationDecider.canAllocate(primary, state.getRoutingNodes().node("node1"), allocation); + assertEquals(decision.toString(), Type.NO, decision.type()); + decision = (Decision.Single) filterAllocationDecider.canAllocate(primary, state.getRoutingNodes().node("node2"), allocation); + assertEquals(decision.toString(), Type.NO, decision.type()); + + Settings updatedSettings = Settings.builder() + .put("cluster.routing.allocation.search.replica.dedicated.include._id", "node2") + .build(); + clusterSettings.applySettings(updatedSettings); + + decision = (Decision.Single) filterAllocationDecider.canAllocate(searchReplica, state.getRoutingNodes().node("node2"), allocation); + assertEquals(decision.toString(), Type.YES, decision.type()); + decision = (Decision.Single) filterAllocationDecider.canAllocate(searchReplica, state.getRoutingNodes().node("node1"), allocation); + assertEquals(decision.toString(), Type.NO, decision.type()); + + decision = (Decision.Single) filterAllocationDecider.canAllocate(regularReplica, state.getRoutingNodes().node("node2"), allocation); + assertEquals(decision.toString(), Type.NO, decision.type()); + decision = (Decision.Single) filterAllocationDecider.canAllocate(regularReplica, state.getRoutingNodes().node("node1"), allocation); + assertEquals(decision.toString(), Type.YES, decision.type()); + + decision = (Decision.Single) filterAllocationDecider.canAllocate(primary, state.getRoutingNodes().node("node1"), allocation); + assertEquals(decision.toString(), Type.YES, decision.type()); + decision = (Decision.Single) filterAllocationDecider.canAllocate(primary, state.getRoutingNodes().node("node2"), allocation); + assertEquals(decision.toString(), Type.NO, decision.type()); + } }