diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java index 6eddc08a0304c..ef4af64ea0493 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/ConfigurationProvider.java @@ -78,4 +78,9 @@ public class ConfigurationProvider { * Configuration for caching */ private CacheConfiguration cache; + + /** + * Configuration for the health check server + */ + private HealthCheckConfiguration healthCheck; } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/HealthCheckConfiguration.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/HealthCheckConfiguration.java new file mode 100644 index 0000000000000..6eadf06288d29 --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/config/HealthCheckConfiguration.java @@ -0,0 +1,9 @@ +package com.linkedin.gms.factory.config; + +import lombok.Data; + + +@Data +public class HealthCheckConfiguration { + private int cacheDurationSeconds; +} diff --git a/metadata-service/factories/src/main/resources/application.yml b/metadata-service/factories/src/main/resources/application.yml index 53a741ffe2511..80c57e334fd53 100644 --- a/metadata-service/factories/src/main/resources/application.yml +++ b/metadata-service/factories/src/main/resources/application.yml @@ -272,6 +272,9 @@ systemUpdate: backOffFactor: ${BOOTSTRAP_SYSTEM_UPDATE_BACK_OFF_FACTOR:2} # Multiplicative factor for back off, default values will result in waiting 5min 15s waitForSystemUpdate: ${BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE:true} +healthCheck: + cacheDurationSeconds: ${HEALTH_CHECK_CACHE_DURATION_SECONDS:5} + featureFlags: showSimplifiedHomepageByDefault: 
${SHOW_SIMPLIFIED_HOMEPAGE_BY_DEFAULT:false} # shows a simplified homepage with just datasets, charts and dashboards by default to users. this can be configured in user settings lineageSearchCacheEnabled: ${LINEAGE_SEARCH_CACHE_ENABLED:true} # Enables in-memory cache for searchAcrossLineage query diff --git a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java b/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java index 608c1deb5b22e..45edcb2a6a5d9 100644 --- a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java +++ b/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java @@ -1,9 +1,15 @@ package com.datahub.health.controller; -import com.google.common.base.Supplier; import com.google.common.base.Suppliers; +import com.linkedin.gms.factory.config.ConfigurationProvider; import io.swagger.v3.oas.annotations.tags.Tag; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; import org.elasticsearch.client.RequestOptions; @@ -26,11 +32,46 @@ public class HealthCheckController { @Autowired @Qualifier("elasticSearchRestHighLevelClient") private RestHighLevelClient elasticClient; - private Supplier> memoizedSupplier; + private final Supplier> memoizedSupplier; + + public HealthCheckController(ConfigurationProvider config) { - public HealthCheckController() { this.memoizedSupplier = Suppliers.memoizeWithExpiration( - this::getElasticHealth, 5, TimeUnit.SECONDS); + this::getElasticHealth, config.getHealthCheck().getCacheDurationSeconds(), TimeUnit.SECONDS); + } + + /** + * Combined 
health check endpoint for checking GMS clients. + * For now, just checks the health of the ElasticSearch client + * @return A ResponseEntity with a Map of String (component name) to ResponseEntity (the health check status of + * that component). The status code will be 200 if all components are okay, and 503 if one or more components are not + * healthy or are unrecognized. + */ + @GetMapping(path = "/ready", produces = MediaType.APPLICATION_JSON_VALUE) + public ResponseEntity>> getCombinedHealthCheck(String... checks) { + + Map>> healthChecks = new HashMap<>(); + healthChecks.put("elasticsearch", this::getElasticHealthWithCache); + // Add new components here + + List componentsToCheck = checks != null && checks.length > 0 + ? Arrays.asList(checks) + : new ArrayList<>(healthChecks.keySet()); + + Map> componentHealth = new HashMap<>(); + for (String check : componentsToCheck) { + componentHealth.put(check, + healthChecks.getOrDefault(check, + () -> ResponseEntity.status(HttpStatus.SERVICE_UNAVAILABLE).body("Unrecognized component " + check)) + .get()); + } + + + boolean isHealthy = componentHealth.values().stream().allMatch(resp -> resp.getStatusCode() == HttpStatus.OK); + if (isHealthy) { + return ResponseEntity.ok(componentHealth); + } + return ResponseEntity.status(HttpStatus.SERVICE_UNAVAILABLE).body(componentHealth); } /** @@ -43,8 +84,8 @@ public ResponseEntity getElasticHealthWithCache() { } /** - * - * @return + * Query ElasticSearch health endpoint + * @return A response including the result from ElasticSearch */ private ResponseEntity getElasticHealth() { String responseString = null; @@ -63,6 +104,6 @@ private ResponseEntity getElasticHealth() { responseString = e.getMessage(); } } - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(responseString); + return ResponseEntity.status(HttpStatus.SERVICE_UNAVAILABLE).body(responseString); } }