diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index 232d74f0e53d0d..532669c44722ce 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -63,8 +63,8 @@ jobs:
env:
ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD != '' && secrets.ACRYL_DOCKER_PASSWORD != '' }}
run: |
- echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}"
- echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT
+ echo "Enable publish: ${{ env.ENABLE_PUBLISH }}"
+ echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT
gms_build:
name: Build and Push DataHub GMS Docker Image
@@ -451,8 +451,6 @@ jobs:
tags: ${{ needs.setup.outputs.tag }}
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
- build-args: |
- DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
publish: ${{ needs.setup.outputs.publish }}
context: .
file: ./docker/datahub-ingestion-base/Dockerfile
@@ -481,7 +479,7 @@ jobs:
uses: ishworkh/docker-image-artifact-download@v1
if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
+ image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }}
- name: Build and push Base-Slim Image
if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }}
uses: ./.github/actions/docker-custom-build-and-push
@@ -493,16 +491,15 @@ jobs:
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
build-args: |
- DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
APP_ENV=slim
- BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
+ BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }}
publish: ${{ needs.setup.outputs.publish }}
context: .
file: ./docker/datahub-ingestion-base/Dockerfile
platforms: linux/amd64,linux/arm64/v8
- name: Compute DataHub Ingestion (Base-Slim) Tag
id: tag
- run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }}" >> $GITHUB_OUTPUT
+ run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT
datahub_ingestion_base_full_build:
name: Build and Push DataHub Ingestion (Base-Full) Docker Image
runs-on: ubuntu-latest
@@ -524,7 +521,7 @@ jobs:
uses: ishworkh/docker-image-artifact-download@v1
if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
+ image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }}
- name: Build and push Base-Full Image
if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }}
uses: ./.github/actions/docker-custom-build-and-push
@@ -532,20 +529,19 @@ jobs:
target: full-install
images: |
${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
- tags: ${{ needs.setup.outputs.full_tag }}
+ tags: ${{ needs.setup.outputs.unique_full_tag }}
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
build-args: |
- DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
APP_ENV=full
- BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}
+ BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }}
publish: ${{ needs.setup.outputs.publish }}
context: .
file: ./docker/datahub-ingestion-base/Dockerfile
platforms: linux/amd64,linux/arm64/v8
- name: Compute DataHub Ingestion (Base-Full) Tag
id: tag
- run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}" >> $GITHUB_OUTPUT
+ run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT
datahub_ingestion_slim_build:
@@ -553,6 +549,7 @@ jobs:
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.tag.outputs.tag }}
+ needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }}
needs: [setup, datahub_ingestion_base_slim_build]
steps:
- name: Check out the repo
@@ -572,9 +569,9 @@ jobs:
run: ./gradlew :metadata-ingestion:codegen
- name: Download Base Image
uses: ishworkh/docker-image-artifact-download@v1
- if: ${{ needs.setup.outputs.publish != 'true' }}
+ if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }}
+ image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}
- name: Build and push Slim Image
if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }}
uses: ./.github/actions/docker-custom-build-and-push
@@ -584,7 +581,7 @@ jobs:
${{ env.DATAHUB_INGESTION_IMAGE }}
build-args: |
BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
- DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }}
+ DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}
APP_ENV=slim
tags: ${{ needs.setup.outputs.slim_tag }}
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
@@ -595,8 +592,7 @@ jobs:
platforms: linux/amd64,linux/arm64/v8
- name: Compute Tag
id: tag
- # TODO: Replace with `head` once publishing is fixed
- run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.slim_tag || 'pr8515-full' }}" >> $GITHUB_OUTPUT
+ run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT
datahub_ingestion_slim_scan:
permissions:
contents: read # for actions/checkout to fetch code
@@ -610,15 +606,15 @@ jobs:
uses: actions/checkout@v3
- name: Download image Slim Image
uses: ishworkh/docker-image-artifact-download@v1
- if: ${{ needs.setup.outputs.publish != 'true' }}
+ if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.slim_tag }}
+ image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }}
- name: Run Trivy vulnerability scanner Slim Image
uses: aquasecurity/trivy-action@0.8.0
env:
TRIVY_OFFLINE_SCAN: true
with:
- image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.slim_tag }}
+ image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }}
format: "template"
template: "@/contrib/sarif.tpl"
output: "trivy-results.sarif"
@@ -635,6 +631,7 @@ jobs:
runs-on: ubuntu-latest
outputs:
tag: ${{ steps.tag.outputs.tag }}
+ needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }}
needs: [setup, datahub_ingestion_base_full_build]
steps:
- name: Check out the repo
@@ -654,9 +651,9 @@ jobs:
run: ./gradlew :metadata-ingestion:codegen
- name: Download Base Image
uses: ishworkh/docker-image-artifact-download@v1
- if: ${{ needs.setup.outputs.publish != 'true' }}
+ if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}
+ image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}
- name: Build and push Full Image
if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }}
uses: ./.github/actions/docker-custom-build-and-push
@@ -666,8 +663,8 @@ jobs:
${{ env.DATAHUB_INGESTION_IMAGE }}
build-args: |
BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
- DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}
- tags: ${{ needs.setup.outputs.full_tag }}
+ DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}
+ tags: ${{ needs.setup.outputs.unique_full_tag }}
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
publish: ${{ needs.setup.outputs.publish }}
@@ -676,7 +673,7 @@ jobs:
platforms: linux/amd64,linux/arm64/v8
- name: Compute Tag (Full)
id: tag
- run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}" >> $GITHUB_OUTPUT
+ run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT
datahub_ingestion_full_scan:
permissions:
contents: read # for actions/checkout to fetch code
@@ -690,15 +687,15 @@ jobs:
uses: actions/checkout@v3
- name: Download image Full Image
uses: ishworkh/docker-image-artifact-download@v1
- if: ${{ needs.setup.outputs.publish != 'true' }}
+ if: ${{ needs.datahub_ingestion_full_build.outputs.needs_artifact_download == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.full_tag }}
+ image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }}
- name: Run Trivy vulnerability scanner Full Image
uses: aquasecurity/trivy-action@0.8.0
env:
TRIVY_OFFLINE_SCAN: true
with:
- image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.full_tag }}
+ image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }}
format: "template"
template: "@/contrib/sarif.tpl"
output: "trivy-results.sarif"
@@ -751,6 +748,10 @@ jobs:
./gradlew :metadata-ingestion:install
- name: Disk Check
run: df -h . && docker images
+ - name: Remove images
+ run: docker image prune -a -f || true
+ - name: Disk Check
+ run: df -h . && docker images
- name: Download GMS image
uses: ishworkh/docker-image-artifact-download@v1
if: ${{ needs.setup.outputs.publish != 'true' }}
@@ -793,9 +794,9 @@ jobs:
image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }}
- name: Download datahub-ingestion-slim image
uses: ishworkh/docker-image-artifact-download@v1
- if: ${{ needs.setup.outputs.publish != 'true' }}
+ if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }}
with:
- image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.setup.outputs.unique_tag }}
+ image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }}
- name: Disk Check
run: df -h . && docker images
- name: run quickstart
@@ -813,6 +814,8 @@ jobs:
# we are doing this because gms takes time to get ready
# and we don't have a better readiness check when bootstrap is done
sleep 60s
+ - name: Disk Check
+ run: df -h . && docker images
- name: Disable ES Disk Threshold
run: |
curl -XPUT "http://localhost:9200/_cluster/settings" \
diff --git a/.gitignore b/.gitignore
index 858f560f0b8429..b6edbccf711258 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,6 +69,7 @@ metadata-ingestion/generated/**
# docs
docs/generated/
+docs-website/versioned_docs/
tmp*
temp/**
diff --git a/README.md b/README.md
index d2208cf6ced490..951dcebad64986 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,11 @@ Please follow the [DataHub Quickstart Guide](https://datahubproject.io/docs/quic
If you're looking to build & modify datahub please take a look at our [Development Guide](https://datahubproject.io/docs/developers).
-[![DataHub Demo GIF](docs/imgs/entity.png)](https://demo.datahubproject.io/)
+
+
+
+
+
## Source Code and Repositories
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
index d6dd2de6d31e35..682710ad5d539d 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
@@ -68,6 +68,7 @@
import com.linkedin.datahub.graphql.generated.ListQueriesResult;
import com.linkedin.datahub.graphql.generated.ListTestsResult;
import com.linkedin.datahub.graphql.generated.ListViewsResult;
+import com.linkedin.datahub.graphql.generated.MatchedField;
import com.linkedin.datahub.graphql.generated.MLFeature;
import com.linkedin.datahub.graphql.generated.MLFeatureProperties;
import com.linkedin.datahub.graphql.generated.MLFeatureTable;
@@ -1008,6 +1009,10 @@ private void configureGenericEntityResolvers(final RuntimeWiring.Builder builder
.dataFetcher("entity", new EntityTypeResolver(entityTypes,
(env) -> ((SearchResult) env.getSource()).getEntity()))
)
+ .type("MatchedField", typeWiring -> typeWiring
+ .dataFetcher("entity", new EntityTypeResolver(entityTypes,
+ (env) -> ((MatchedField) env.getSource()).getEntity()))
+ )
.type("SearchAcrossLineageResult", typeWiring -> typeWiring
.dataFetcher("entity", new EntityTypeResolver(entityTypes,
(env) -> ((SearchAcrossLineageResult) env.getSource()).getEntity()))
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java
index 94880c77d74bcc..3089b8c8fc2dba 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java
@@ -107,7 +107,31 @@ public static boolean canEditGroupMembers(@Nonnull String groupUrnStr, @Nonnull
}
public static boolean canCreateGlobalAnnouncements(@Nonnull QueryContext context) {
- return isAuthorized(context, Optional.empty(), PoliciesConfig.CREATE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE);
+ final DisjunctivePrivilegeGroup orPrivilegeGroups = new DisjunctivePrivilegeGroup(
+ ImmutableList.of(
+ new ConjunctivePrivilegeGroup(ImmutableList.of(
+ PoliciesConfig.CREATE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType())),
+ new ConjunctivePrivilegeGroup(ImmutableList.of(
+ PoliciesConfig.MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType()))
+ ));
+
+ return AuthorizationUtils.isAuthorized(
+ context.getAuthorizer(),
+ context.getActorUrn(),
+ orPrivilegeGroups);
+ }
+
+ public static boolean canManageGlobalAnnouncements(@Nonnull QueryContext context) {
+ final DisjunctivePrivilegeGroup orPrivilegeGroups = new DisjunctivePrivilegeGroup(
+ ImmutableList.of(
+ new ConjunctivePrivilegeGroup(ImmutableList.of(
+ PoliciesConfig.MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType()))
+ ));
+
+ return AuthorizationUtils.isAuthorized(
+ context.getAuthorizer(),
+ context.getActorUrn(),
+ orPrivilegeGroups);
}
public static boolean canManageGlobalViews(@Nonnull QueryContext context) {
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java
index d2a7b19857f95f..02921b453e3154 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java
@@ -74,6 +74,7 @@ public CompletableFuture get(DataFetchingEnvironment environm
platformPrivileges.setManageTags(AuthorizationUtils.canManageTags(context));
platformPrivileges.setManageGlobalViews(AuthorizationUtils.canManageGlobalViews(context));
platformPrivileges.setManageOwnershipTypes(AuthorizationUtils.canManageOwnershipTypes(context));
+ platformPrivileges.setManageGlobalAnnouncements(AuthorizationUtils.canManageGlobalAnnouncements(context));
// Construct and return authenticated user object.
final AuthenticatedUser authUser = new AuthenticatedUser();
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
index 2c55bc79fe5017..90017f7b879972 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
@@ -18,6 +18,7 @@
import com.linkedin.datahub.graphql.generated.Privilege;
import com.linkedin.datahub.graphql.generated.QueriesTabConfig;
import com.linkedin.datahub.graphql.generated.ResourcePrivileges;
+import com.linkedin.datahub.graphql.generated.SearchResultsVisualConfig;
import com.linkedin.datahub.graphql.generated.TelemetryConfig;
import com.linkedin.datahub.graphql.generated.TestsConfig;
import com.linkedin.datahub.graphql.generated.ViewsConfig;
@@ -144,6 +145,13 @@ public CompletableFuture get(final DataFetchingEnvironment environmen
}
visualConfig.setEntityProfiles(entityProfilesConfig);
}
+ if (_visualConfiguration != null && _visualConfiguration.getSearchResult() != null) {
+ SearchResultsVisualConfig searchResultsVisualConfig = new SearchResultsVisualConfig();
+ if (_visualConfiguration.getSearchResult().getEnableNameHighlight() != null) {
+ searchResultsVisualConfig.setEnableNameHighlight(_visualConfiguration.getSearchResult().getEnableNameHighlight());
+ }
+ visualConfig.setSearchResult(searchResultsVisualConfig);
+ }
appConfig.setVisualConfig(visualConfig);
final TelemetryConfig telemetryConfig = new TelemetryConfig();
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java
index cd2a3dda70033a..d3cd0126fb8527 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java
@@ -23,7 +23,7 @@ public class DeletePostResolver implements DataFetcher<CompletableFuture<Boolean>> {
   public CompletableFuture<Boolean> get(final DataFetchingEnvironment environment) throws Exception {
final QueryContext context = environment.getContext();
- if (!AuthorizationUtils.canCreateGlobalAnnouncements(context)) {
+ if (!AuthorizationUtils.canManageGlobalAnnouncements(context)) {
throw new AuthorizationException(
"Unauthorized to delete posts. Please contact your DataHub administrator if this needs corrective action.");
}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java
index e40bbca56b4167..fe5b79ba2ea3d6 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java
@@ -73,7 +73,6 @@ private SearchUtils() {
EntityType.CONTAINER,
EntityType.DOMAIN,
EntityType.DATA_PRODUCT,
- EntityType.ROLE,
EntityType.NOTEBOOK);
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java
index 6435d6ee4c8e55..f3ac008734339e 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java
@@ -39,6 +39,9 @@ public com.linkedin.metadata.query.SearchFlags apply(@Nonnull final SearchFlags
if (searchFlags.getSkipAggregates() != null) {
result.setSkipAggregates(searchFlags.getSkipAggregates());
}
+ if (searchFlags.getGetSuggestions() != null) {
+ result.setGetSuggestions(searchFlags.getGetSuggestions());
+ }
return result;
}
}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java
index 0b292a373ea40e..5ba32b0c2a77c1 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java
@@ -1,12 +1,18 @@
package com.linkedin.datahub.graphql.types.mappers;
+import com.linkedin.common.urn.Urn;
import com.linkedin.datahub.graphql.generated.AggregationMetadata;
import com.linkedin.datahub.graphql.generated.FacetMetadata;
import com.linkedin.datahub.graphql.generated.MatchedField;
import com.linkedin.datahub.graphql.generated.SearchResult;
+import com.linkedin.datahub.graphql.generated.SearchSuggestion;
import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper;
import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper;
import com.linkedin.metadata.search.SearchEntity;
+import com.linkedin.metadata.search.utils.SearchUtils;
+import lombok.extern.slf4j.Slf4j;
+
+import java.net.URISyntaxException;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
@@ -16,6 +22,7 @@
import static com.linkedin.metadata.utils.SearchUtil.*;
+@Slf4j
public class MapperUtils {
private MapperUtils() {
@@ -54,7 +61,24 @@ public static String convertFilterValue(String filterValue, List isEnti
  public static List<MatchedField> getMatchedFieldEntry(List<com.linkedin.metadata.search.MatchedField> highlightMetadata) {
return highlightMetadata.stream()
- .map(field -> new MatchedField(field.getName(), field.getValue()))
+ .map(field -> {
+ MatchedField matchedField = new MatchedField();
+ matchedField.setName(field.getName());
+ matchedField.setValue(field.getValue());
+ if (SearchUtils.isUrn(field.getValue())) {
+ try {
+ Urn urn = Urn.createFromString(field.getValue());
+ matchedField.setEntity(UrnToEntityMapper.map(urn));
+ } catch (URISyntaxException e) {
+ log.warn("Failed to create urn from MatchedField value: {}", field.getValue(), e);
+ }
+ }
+ return matchedField;
+ })
.collect(Collectors.toList());
}
+
+ public static SearchSuggestion mapSearchSuggestion(com.linkedin.metadata.search.SearchSuggestion suggestion) {
+ return new SearchSuggestion(suggestion.getText(), suggestion.getScore(), Math.toIntExact(suggestion.getFrequency()));
+ }
}
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java
index 9f750820e30935..b16e2f10d1df7a 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java
@@ -27,6 +27,7 @@ public SearchResults apply(com.linkedin.metadata.search.SearchResult input) {
final SearchResultMetadata searchResultMetadata = input.getMetadata();
result.setSearchResults(input.getEntities().stream().map(MapperUtils::mapResult).collect(Collectors.toList()));
result.setFacets(searchResultMetadata.getAggregations().stream().map(MapperUtils::mapFacet).collect(Collectors.toList()));
+ result.setSuggestions(searchResultMetadata.getSuggestions().stream().map(MapperUtils::mapSearchSuggestion).collect(Collectors.toList()));
return result;
}
diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql
index 37183bac13f0eb..dbee24b4bf6f7d 100644
--- a/datahub-graphql-core/src/main/resources/app.graphql
+++ b/datahub-graphql-core/src/main/resources/app.graphql
@@ -125,6 +125,11 @@ type PlatformPrivileges {
Whether the user should be able to create, update, and delete ownership types.
"""
manageOwnershipTypes: Boolean!
+
+ """
+ Whether the user can create and delete posts pinned to the home page.
+ """
+ manageGlobalAnnouncements: Boolean!
}
"""
@@ -216,6 +221,11 @@ type VisualConfig {
Configuration for the queries tab
"""
entityProfiles: EntityProfilesConfig
+
+ """
+ Configuration for search results
+ """
+ searchResult: SearchResultsVisualConfig
}
"""
@@ -250,6 +260,16 @@ type EntityProfileConfig {
defaultTab: String
}
+"""
+Configuration for a search result
+"""
+type SearchResultsVisualConfig {
+ """
+ Whether a search result should highlight the name/description if it was matched on those fields.
+ """
+ enableNameHighlight: Boolean
+}
+
"""
Configurations related to tracking users in the app
"""
diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql
index fbea66f7389553..4cabdb04afe77c 100644
--- a/datahub-graphql-core/src/main/resources/search.graphql
+++ b/datahub-graphql-core/src/main/resources/search.graphql
@@ -138,6 +138,11 @@ input SearchFlags {
Whether to skip aggregates/facets
"""
skipAggregates: Boolean
+
+ """
+ Whether to request for search suggestions on the _entityName virtualized field
+ """
+ getSuggestions: Boolean
}
"""
@@ -483,6 +488,11 @@ type SearchResults {
Candidate facet aggregations used for search filtering
"""
facets: [FacetMetadata!]
+
+ """
+ Search suggestions based on the query provided for alternate query texts
+ """
+ suggestions: [SearchSuggestion!]
}
"""
@@ -665,6 +675,11 @@ type MatchedField {
Value of the field that matched
"""
value: String!
+
+ """
+ Entity if the value is an urn
+ """
+ entity: Entity
}
"""
@@ -722,6 +737,31 @@ type AggregationMetadata {
entity: Entity
}
+"""
+A suggestion for an alternate search query given an original query compared to all
+of the entity names in our search index.
+"""
+type SearchSuggestion {
+ """
+ The suggested text based on the provided query text compared to
+ the entity name field in the search index.
+ """
+ text: String!
+
+ """
+ The "edit distance" for this suggestion. The closer this number is to 1, the
+ closer the suggested text is to the original text. The closer it is to 0, the
+ further from the original text it is.
+ """
+ score: Float
+
+ """
+ The number of entities that would match on the name field given the suggested text
+ """
+ frequency: Int
+}
+
+
"""
Input for performing an auto completion query against a single Metadata Entity
"""
diff --git a/datahub-web-react/README.md b/datahub-web-react/README.md
index 6c91b169af858f..8bf592b11a0aec 100644
--- a/datahub-web-react/README.md
+++ b/datahub-web-react/README.md
@@ -126,7 +126,9 @@ for functional configurability should reside.
to render a view associated with a particular entity type (user, dataset, etc.).
-![entity-registry](./entity-registry.png)
+
+
+
**graphql** - The React App talks to the `dathub-frontend` server using GraphQL. This module is where the *queries* issued
against the server are defined. Once defined, running `yarn run generate` will code-gen TypeScript objects to make invoking
diff --git a/datahub-web-react/src/Mocks.tsx b/datahub-web-react/src/Mocks.tsx
index dcefc7f70d7857..a2e14308e8cee2 100644
--- a/datahub-web-react/src/Mocks.tsx
+++ b/datahub-web-react/src/Mocks.tsx
@@ -1973,6 +1973,7 @@ export const mocks = [
count: 10,
filters: [],
orFilters: [],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -2033,6 +2034,7 @@ export const mocks = [
],
},
],
+ suggestions: [],
},
} as GetSearchResultsQuery,
},
@@ -2059,6 +2061,7 @@ export const mocks = [
],
},
],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -2112,6 +2115,7 @@ export const mocks = [
],
},
],
+ suggestions: [],
},
} as GetSearchResultsQuery,
},
@@ -2230,6 +2234,7 @@ export const mocks = [
],
},
],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -2251,6 +2256,7 @@ export const mocks = [
insights: [],
},
],
+ suggestions: [],
facets: [
{
field: 'origin',
@@ -2772,6 +2778,7 @@ export const mocks = [
],
},
],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -2794,6 +2801,7 @@ export const mocks = [
insights: [],
},
],
+ suggestions: [],
facets: [
{
__typename: 'FacetMetadata',
@@ -2886,6 +2894,7 @@ export const mocks = [
],
},
],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -2908,6 +2917,7 @@ export const mocks = [
},
],
facets: [],
+ suggestions: [],
},
} as GetSearchResultsForMultipleQuery,
},
@@ -2934,6 +2944,7 @@ export const mocks = [
],
},
],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -2955,6 +2966,7 @@ export const mocks = [
insights: [],
},
],
+ suggestions: [],
facets: [
{
field: 'origin',
@@ -3007,6 +3019,7 @@ export const mocks = [
],
},
],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -3028,6 +3041,7 @@ export const mocks = [
insights: [],
},
],
+ suggestions: [],
facets: [
{
field: 'origin',
@@ -3084,6 +3098,7 @@ export const mocks = [
],
},
],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -3113,6 +3128,7 @@ export const mocks = [
insights: [],
},
],
+ suggestions: [],
facets: [
{
field: 'origin',
@@ -3175,6 +3191,7 @@ export const mocks = [
],
},
],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -3196,6 +3213,7 @@ export const mocks = [
insights: [],
},
],
+ suggestions: [],
facets: [
{
field: 'origin',
@@ -3258,6 +3276,7 @@ export const mocks = [
],
},
],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -3279,6 +3298,7 @@ export const mocks = [
insights: [],
},
],
+ suggestions: [],
facets: [
{
field: 'origin',
@@ -3363,6 +3383,7 @@ export const mocks = [
generatePersonalAccessTokens: true,
manageGlobalViews: true,
manageOwnershipTypes: true,
+ manageGlobalAnnouncements: true,
},
},
},
@@ -3450,6 +3471,7 @@ export const mocks = [
count: 10,
filters: [],
orFilters: [],
+ searchFlags: { getSuggestions: true },
},
},
},
@@ -3461,6 +3483,7 @@ export const mocks = [
total: 0,
searchResults: [],
facets: [],
+ suggestions: [],
},
},
},
@@ -3609,4 +3632,5 @@ export const platformPrivileges: PlatformPrivileges = {
createDomains: true,
manageGlobalViews: true,
manageOwnershipTypes: true,
+ manageGlobalAnnouncements: true,
};
diff --git a/datahub-web-react/src/app/entity/EntityRegistry.tsx b/datahub-web-react/src/app/entity/EntityRegistry.tsx
index a07fd02841197c..56b085cf69f4aa 100644
--- a/datahub-web-react/src/app/entity/EntityRegistry.tsx
+++ b/datahub-web-react/src/app/entity/EntityRegistry.tsx
@@ -1,5 +1,7 @@
+import React from 'react';
import { Entity as EntityInterface, EntityType, SearchResult } from '../../types.generated';
import { FetchedEntity } from '../lineage/types';
+import { SearchResultProvider } from '../search/context/SearchResultContext';
import { Entity, EntityCapabilityType, IconStyleType, PreviewType } from './Entity';
import { GLOSSARY_ENTITY_TYPES } from './shared/constants';
import { GenericEntityProperties } from './shared/types';
@@ -119,7 +121,9 @@ export default class EntityRegistry {
renderSearchResult(type: EntityType, searchResult: SearchResult): JSX.Element {
const entity = validatedGet(type, this.entityTypeToEntity);
- return entity.renderSearch(searchResult);
+ return (
+            <SearchResultProvider searchResult={searchResult}>{entity.renderSearch(searchResult)}</SearchResultProvider>
+ );
}
renderBrowse(type: EntityType, data: T): JSX.Element {
diff --git a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx
index b5ebcbef803792..0f1b6dbf3d660d 100644
--- a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx
+++ b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx
@@ -19,13 +19,14 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown';
import { LineageTab } from '../shared/tabs/Lineage/LineageTab';
import { ChartStatsSummarySubHeader } from './profile/stats/ChartStatsSummarySubHeader';
import { InputFieldsTab } from '../shared/tabs/Entity/InputFieldsTab';
-import { ChartSnippet } from './ChartSnippet';
import { EmbedTab } from '../shared/tabs/Embed/EmbedTab';
import { capitalizeFirstLetterOnly } from '../../shared/textUtil';
import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection';
import { getDataProduct } from '../shared/utils';
import EmbeddedProfile from '../shared/embed/EmbeddedProfile';
import { LOOKER_URN } from '../../ingest/source/builder/constants';
+import { MatchedFieldList } from '../../search/matches/MatchedFieldList';
+import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer';
/**
* Definition of the DataHub Chart entity.
@@ -203,7 +204,11 @@ export class ChartEntity implements Entity {
lastUpdatedMs={data.properties?.lastModified?.time}
createdMs={data.properties?.created?.time}
externalUrl={data.properties?.externalUrl}
-                        snippet={<ChartSnippet matchedFields={result.matchedFields} inputFields={data.inputFields} />}
+                        snippet={
+                            <MatchedFieldList
+                                customFieldRenderer={(matchedField) => matchedInputFieldRenderer(matchedField, data)}
+                            />
+                        }
degree={(result as any).degree}
paths={(result as any).paths}
/>
diff --git a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx b/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx
deleted file mode 100644
index 27982d3037207a..00000000000000
--- a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx
+++ /dev/null
@@ -1,53 +0,0 @@
-import React from 'react';
-
-import { Typography } from 'antd';
-import { InputFields, MatchedField, Maybe } from '../../../types.generated';
-import TagTermGroup from '../../shared/tags/TagTermGroup';
-import { FIELDS_TO_HIGHLIGHT } from '../dataset/search/highlights';
-import { getMatchPrioritizingPrimary } from '../shared/utils';
-
-type Props = {
- matchedFields: MatchedField[];
- inputFields: Maybe | undefined;
- isMatchingDashboard?: boolean;
-};
-
-const LABEL_INDEX_NAME = 'fieldLabels';
-const TYPE_PROPERTY_KEY_NAME = 'type';
-
-export const ChartSnippet = ({ matchedFields, inputFields, isMatchingDashboard = false }: Props) => {
- const matchedField = getMatchPrioritizingPrimary(matchedFields, 'fieldLabels');
-
- if (matchedField?.name === LABEL_INDEX_NAME) {
- const matchedSchemaField = inputFields?.fields?.find(
- (field) => field?.schemaField?.label === matchedField.value,
- );
- const matchedGlossaryTerm = matchedSchemaField?.schemaField?.glossaryTerms?.terms?.find(
- (term) => term?.term?.name === matchedField.value,
- );
-
- if (matchedGlossaryTerm) {
- let termType = 'term';
- const typeProperty = matchedGlossaryTerm.term.properties?.customProperties?.find(
- (property) => property.key === TYPE_PROPERTY_KEY_NAME,
- );
- if (typeProperty) {
- termType = typeProperty.value || termType;
- }
-
- return (
-
- Matches {termType} {' '}
- {isMatchingDashboard && 'on a contained Chart'}
-
- );
- }
- }
-
- return matchedField ? (
-
- Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {matchedField.value} {' '}
- {isMatchingDashboard && 'on a contained Chart'}
-
- ) : null;
-};
diff --git a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx
index a64e4372652620..0a36d0e5f1bfad 100644
--- a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx
+++ b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx
@@ -24,12 +24,13 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown';
import { LineageTab } from '../shared/tabs/Lineage/LineageTab';
import { capitalizeFirstLetterOnly } from '../../shared/textUtil';
import { DashboardStatsSummarySubHeader } from './profile/DashboardStatsSummarySubHeader';
-import { ChartSnippet } from '../chart/ChartSnippet';
import { EmbedTab } from '../shared/tabs/Embed/EmbedTab';
import EmbeddedProfile from '../shared/embed/EmbeddedProfile';
import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection';
import { getDataProduct } from '../shared/utils';
import { LOOKER_URN } from '../../ingest/source/builder/constants';
+import { MatchedFieldList } from '../../search/matches/MatchedFieldList';
+import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer';
/**
* Definition of the DataHub Dashboard entity.
@@ -227,10 +228,9 @@ export class DashboardEntity implements Entity {
lastUpdatedMs={data.properties?.lastModified?.time}
createdMs={data.properties?.created?.time}
snippet={
-                                <ChartSnippet
-                                    isMatchingDashboard
-                                    matchedFields={result.matchedFields}
-                                    inputFields={data.inputFields}
+                                <MatchedFieldList
+                                    customFieldRenderer={(matchedField) => matchedInputFieldRenderer(matchedField, data)}
+                                    matchSuffix="on a contained chart"
/>
}
subtype={data.subTypes?.typeNames?.[0]}
diff --git a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx
index cb4239872045f6..ed3904bcf4e2d6 100644
--- a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx
+++ b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx
@@ -25,11 +25,12 @@ import { OperationsTab } from './profile/OperationsTab';
import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown';
import { SidebarSiblingsSection } from '../shared/containers/profile/sidebar/SidebarSiblingsSection';
import { DatasetStatsSummarySubHeader } from './profile/stats/stats/DatasetStatsSummarySubHeader';
-import { DatasetSearchSnippet } from './DatasetSearchSnippet';
+import { MatchedFieldList } from '../../search/matches/MatchedFieldList';
import { EmbedTab } from '../shared/tabs/Embed/EmbedTab';
import EmbeddedProfile from '../shared/embed/EmbeddedProfile';
import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection';
import { getDataProduct } from '../shared/utils';
+import { matchedFieldPathsRenderer } from '../../search/matches/matchedFieldPathsRenderer';
const SUBTYPES = {
VIEW: 'view',
@@ -290,7 +291,7 @@ export class DatasetEntity implements Entity {
subtype={data.subTypes?.typeNames?.[0]}
container={data.container}
parentContainers={data.parentContainers}
-                        snippet={<DatasetSearchSnippet matchedFields={result.matchedFields} />}
+                        snippet={<MatchedFieldList customFieldRenderer={matchedFieldPathsRenderer} />}
insights={result.insights}
externalUrl={data.properties?.externalUrl}
statsSummary={data.statsSummary}
diff --git a/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx b/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx
deleted file mode 100644
index e4f88eb0fbbfa2..00000000000000
--- a/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx
+++ /dev/null
@@ -1,39 +0,0 @@
-import React from 'react';
-
-import { Typography } from 'antd';
-import { MatchedField } from '../../../types.generated';
-import { TagSummary } from './shared/TagSummary';
-import { TermSummary } from './shared/TermSummary';
-import { FIELDS_TO_HIGHLIGHT } from './search/highlights';
-import { getMatchPrioritizingPrimary } from '../shared/utils';
-import { downgradeV2FieldPath } from './profile/schema/utils/utils';
-
-type Props = {
- matchedFields: MatchedField[];
-};
-
-const LABEL_INDEX_NAME = 'fieldLabels';
-
-export const DatasetSearchSnippet = ({ matchedFields }: Props) => {
- const matchedField = getMatchPrioritizingPrimary(matchedFields, LABEL_INDEX_NAME);
-
- let snippet: React.ReactNode;
-
- if (matchedField) {
- if (matchedField.value.includes('urn:li:tag')) {
- snippet = ;
- } else if (matchedField.value.includes('urn:li:glossaryTerm')) {
- snippet = ;
- } else if (matchedField.name === 'fieldPaths') {
- snippet = {downgradeV2FieldPath(matchedField.value)} ;
- } else {
- snippet = {matchedField.value} ;
- }
- }
-
- return matchedField ? (
-
- Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {snippet}{' '}
-
- ) : null;
-};
diff --git a/datahub-web-react/src/app/entity/dataset/search/highlights.ts b/datahub-web-react/src/app/entity/dataset/search/highlights.ts
deleted file mode 100644
index 64505e0709c7ba..00000000000000
--- a/datahub-web-react/src/app/entity/dataset/search/highlights.ts
+++ /dev/null
@@ -1,7 +0,0 @@
-export const FIELDS_TO_HIGHLIGHT = new Map();
-FIELDS_TO_HIGHLIGHT.set('fieldPaths', 'column');
-FIELDS_TO_HIGHLIGHT.set('fieldDescriptions', 'column description');
-FIELDS_TO_HIGHLIGHT.set('fieldTags', 'column tag');
-FIELDS_TO_HIGHLIGHT.set('editedFieldDescriptions', 'column description');
-FIELDS_TO_HIGHLIGHT.set('editedFieldTags', 'column tag');
-FIELDS_TO_HIGHLIGHT.set('fieldLabels', 'label');
diff --git a/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx
deleted file mode 100644
index 106cc298fb58c4..00000000000000
--- a/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx
+++ /dev/null
@@ -1,38 +0,0 @@
-import React from 'react';
-import styled from 'styled-components';
-import { useGetTagQuery } from '../../../../graphql/tag.generated';
-import { EntityType, Tag } from '../../../../types.generated';
-import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip';
-import { useEntityRegistry } from '../../../useEntityRegistry';
-import { StyledTag } from '../../shared/components/styled/StyledTag';
-
-const TagLink = styled.span`
- display: inline-block;
-`;
-
-type Props = {
- urn: string;
-};
-
-export const TagSummary = ({ urn }: Props) => {
- const entityRegistry = useEntityRegistry();
- const { data } = useGetTagQuery({ variables: { urn } });
- return (
- <>
- {data && (
-
-
-
- {entityRegistry.getDisplayName(EntityType.Tag, data?.tag)}
-
-
-
- )}
-            </>
- );
-};
diff --git a/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx
deleted file mode 100644
index cc1274693a3420..00000000000000
--- a/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx
+++ /dev/null
@@ -1,36 +0,0 @@
-import React from 'react';
-import { Tag } from 'antd';
-import { BookOutlined } from '@ant-design/icons';
-import styled from 'styled-components';
-import { useGetGlossaryTermQuery } from '../../../../graphql/glossaryTerm.generated';
-import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip';
-import { EntityType, GlossaryTerm } from '../../../../types.generated';
-import { useEntityRegistry } from '../../../useEntityRegistry';
-
-const TermLink = styled.span`
- display: inline-block;
-`;
-
-type Props = {
- urn: string;
-};
-
-export const TermSummary = ({ urn }: Props) => {
- const entityRegistry = useEntityRegistry();
- const { data } = useGetGlossaryTermQuery({ variables: { urn } });
-
- return (
- <>
- {data && (
-
-
-
-
- {entityRegistry.getDisplayName(EntityType.GlossaryTerm, data?.glossaryTerm)}
-
-
-
- )}
-            </>
- );
-};
diff --git a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx
index 26d3cf456ab7a6..b6802e37652cb0 100644
--- a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx
+++ b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx
@@ -4,6 +4,8 @@ import { Deprecation, Domain, EntityType, Owner, ParentNodesResult } from '../..
import DefaultPreviewCard from '../../../preview/DefaultPreviewCard';
import { useEntityRegistry } from '../../../useEntityRegistry';
import { IconStyleType, PreviewType } from '../../Entity';
+import UrlButton from '../../shared/UrlButton';
+import { getRelatedEntitiesUrl } from '../utils';
export const Preview = ({
urn,
@@ -39,6 +41,9 @@ export const Preview = ({
deprecation={deprecation}
parentNodes={parentNodes}
domain={domain}
+ entityTitleSuffix={
+                <UrlButton href={getRelatedEntitiesUrl(entityRegistry, urn)}>View Related Entities</UrlButton>
+ }
/>
);
};
diff --git a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx
index d0e8de0928b487..098e97e526fd84 100644
--- a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx
+++ b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx
@@ -5,7 +5,7 @@ import { EmbeddedListSearchSection } from '../../shared/components/styled/search
import { useEntityData } from '../../shared/EntityContext';
export default function GlossaryRelatedEntity() {
- const { entityData }: any = useEntityData();
+ const { entityData } = useEntityData();
const entityUrn = entityData?.urn;
diff --git a/datahub-web-react/src/app/entity/glossaryTerm/utils.ts b/datahub-web-react/src/app/entity/glossaryTerm/utils.ts
index 3a2a3d35a8126f..cbfa76fa348663 100644
--- a/datahub-web-react/src/app/entity/glossaryTerm/utils.ts
+++ b/datahub-web-react/src/app/entity/glossaryTerm/utils.ts
@@ -6,3 +6,7 @@ export function sortGlossaryTerms(entityRegistry: EntityRegistry, nodeA?: Entity
const nodeBName = entityRegistry.getDisplayName(EntityType.GlossaryTerm, nodeB) || '';
return nodeAName.localeCompare(nodeBName);
}
+
+export function getRelatedEntitiesUrl(entityRegistry: EntityRegistry, urn: string) {
+ return `${entityRegistry.getEntityUrl(EntityType.GlossaryTerm, urn)}/${encodeURIComponent('Related Entities')}`;
+}
diff --git a/datahub-web-react/src/app/entity/group/preview/Preview.tsx b/datahub-web-react/src/app/entity/group/preview/Preview.tsx
index dc83f6fe4f840f..67449b9a481f09 100644
--- a/datahub-web-react/src/app/entity/group/preview/Preview.tsx
+++ b/datahub-web-react/src/app/entity/group/preview/Preview.tsx
@@ -8,6 +8,7 @@ import { useEntityRegistry } from '../../../useEntityRegistry';
import { ANTD_GRAY } from '../../shared/constants';
import { IconStyleType } from '../../Entity';
import NoMarkdownViewer from '../../shared/components/styled/StripMarkdownText';
+import SearchTextHighlighter from '../../../search/matches/SearchTextHighlighter';
const PreviewContainer = styled.div`
margin-bottom: 4px;
@@ -87,7 +88,9 @@ export const Preview = ({
{entityRegistry.getEntityName(EntityType.CorpGroup)}
- {name || urn}
+
+                    {name ? <SearchTextHighlighter field="name" text={name} /> : urn}
+
{membersCount} members
@@ -96,7 +99,12 @@ export const Preview = ({
{description && description.length > 0 && (
- {description}
+ }
+ >
+ {description}
+
)}
diff --git a/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx b/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx
index 9677af07766042..dce74c02cdb345 100644
--- a/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx
+++ b/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx
@@ -1,28 +1,11 @@
-import { ArrowRightOutlined } from '@ant-design/icons';
-import { Button } from 'antd';
import React from 'react';
-import styled from 'styled-components/macro';
import { EntityType } from '../../../types.generated';
import analytics, { EventType, EntityActionType } from '../../analytics';
+import UrlButton from './UrlButton';
const GITHUB_LINK = 'github.com';
const GITHUB = 'GitHub';
-const ExternalUrlWrapper = styled.span`
- font-size: 12px;
-`;
-
-const StyledButton = styled(Button)`
- > :hover {
- text-decoration: underline;
- }
- &&& {
- padding-bottom: 0px;
- }
- padding-left: 12px;
- padding-right: 12px;
-`;
-
interface Props {
externalUrl: string;
platformName?: string;
@@ -46,17 +29,8 @@ export default function ExternalUrlButton({ externalUrl, platformName, entityTyp
}
return (
-
-
- {displayedName ? `View in ${displayedName}` : 'View link'}{' '}
-
-
-
+        <UrlButton href={externalUrl} onClick={sendAnalytics}>
+            {displayedName ? `View in ${displayedName}` : 'View link'}
+        </UrlButton>
);
}
diff --git a/datahub-web-react/src/app/entity/shared/UrlButton.tsx b/datahub-web-react/src/app/entity/shared/UrlButton.tsx
new file mode 100644
index 00000000000000..a6f6da4a60ad57
--- /dev/null
+++ b/datahub-web-react/src/app/entity/shared/UrlButton.tsx
@@ -0,0 +1,37 @@
+import React, { ReactNode } from 'react';
+import { ArrowRightOutlined } from '@ant-design/icons';
+import { Button } from 'antd';
+import styled from 'styled-components/macro';
+
+const UrlButtonContainer = styled.span`
+ font-size: 12px;
+`;
+
+const StyledButton = styled(Button)`
+ > :hover {
+ text-decoration: underline;
+ }
+ &&& {
+ padding-bottom: 0px;
+ }
+ padding-left: 12px;
+ padding-right: 12px;
+`;
+
+interface Props {
+ href: string;
+ children: ReactNode;
+ onClick?: () => void;
+}
+
+const NOOP = () => {};
+
+export default function UrlButton({ href, children, onClick = NOOP }: Props) {
+ return (
+        <UrlButtonContainer>
+            <StyledButton type="link" href={href} target="_blank" rel="noreferrer noopener" onClick={onClick}>
+                {children} <ArrowRightOutlined style={{ fontSize: 12 }} />
+            </StyledButton>
+        </UrlButtonContainer>
+ );
+}
diff --git a/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts b/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts
deleted file mode 100644
index 86dec46528b494..00000000000000
--- a/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts
+++ /dev/null
@@ -1,37 +0,0 @@
-import { getMatchPrioritizingPrimary } from '../utils';
-
-const MOCK_MATCHED_FIELDS = [
- {
- name: 'fieldPaths',
- value: 'rain',
- },
- {
- name: 'description',
- value: 'rainbow',
- },
- {
- name: 'fieldPaths',
- value: 'rainbow',
- },
- {
- name: 'fieldPaths',
- value: 'rainbows',
- },
-];
-
-describe('utils', () => {
- describe('getMatchPrioritizingPrimary', () => {
- it('prioritizes exact match', () => {
- global.window.location.search = 'query=rainbow';
- const match = getMatchPrioritizingPrimary(MOCK_MATCHED_FIELDS, 'fieldPaths');
- expect(match?.value).toEqual('rainbow');
- expect(match?.name).toEqual('fieldPaths');
- });
- it('will accept first contains match', () => {
- global.window.location.search = 'query=bow';
- const match = getMatchPrioritizingPrimary(MOCK_MATCHED_FIELDS, 'fieldPaths');
- expect(match?.value).toEqual('rainbow');
- expect(match?.name).toEqual('fieldPaths');
- });
- });
-});
diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx
index 59293c2b0eee5b..212813ffcb6435 100644
--- a/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx
+++ b/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx
@@ -17,6 +17,7 @@ export type Props = {
suffix?: JSX.Element;
limit?: number;
shouldWrap?: boolean;
+ customRender?: (text: string) => JSX.Element;
};
export const removeMarkdown = (text: string) => {
@@ -29,7 +30,7 @@ export const removeMarkdown = (text: string) => {
.replace(/^•/, ''); // remove first •
};
-export default function NoMarkdownViewer({ children, readMore, suffix, limit, shouldWrap }: Props) {
+export default function NoMarkdownViewer({ children, customRender, readMore, suffix, limit, shouldWrap }: Props) {
let plainText = removeMarkdown(children || '');
if (limit) {
@@ -44,7 +45,8 @@ export default function NoMarkdownViewer({ children, readMore, suffix, limit, sh
return (
-            {plainText} {showReadMore && <>{readMore}</>} {suffix}
+            {customRender ? customRender(plainText) : plainText}
+            {showReadMore && <>{readMore}</>} {suffix}
);
}
diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx
index c1a23811fdd7e0..08087bfd79b8e9 100644
--- a/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx
+++ b/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx
@@ -6,7 +6,15 @@ export const generateColor = new ColorHash({
saturation: 0.9,
});
-export const StyledTag = styled(Tag)<{ $color: any; $colorHash?: string; fontSize?: number }>`
+export const StyledTag = styled(Tag)<{ $color: any; $colorHash?: string; fontSize?: number; highlightTag?: boolean }>`
+ &&& {
+ ${(props) =>
+ props.highlightTag &&
+ `
+ background: ${props.theme.styles['highlight-color']};
+ border: 1px solid ${props.theme.styles['highlight-border-color']};
+ `}
+ }
${(props) => props.fontSize && `font-size: ${props.fontSize}px;`}
${(props) =>
props.$colorHash &&
diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
index 1aef497ced57bf..bcce994c3f0f80 100644
--- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
+++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
@@ -33,7 +33,7 @@ type LinkListProps = {
};
export const LinkList = ({ refetch }: LinkListProps) => {
- const { entityData } = useEntityData();
+ const { urn: entityUrn, entityData } = useEntityData();
const entityRegistry = useEntityRegistry();
const [removeLinkMutation] = useRemoveLinkMutation();
const links = entityData?.institutionalMemory?.elements || [];
@@ -41,7 +41,7 @@ export const LinkList = ({ refetch }: LinkListProps) => {
const handleDeleteLink = async (metadata: InstitutionalMemoryMetadata) => {
try {
await removeLinkMutation({
- variables: { input: { linkUrl: metadata.url, resourceUrn: metadata.associatedUrn } },
+ variables: { input: { linkUrl: metadata.url, resourceUrn: metadata.associatedUrn || entityUrn } },
});
message.success({ content: 'Link Removed', duration: 2 });
} catch (e: unknown) {
diff --git a/datahub-web-react/src/app/entity/shared/utils.ts b/datahub-web-react/src/app/entity/shared/utils.ts
index 7ec604785d1ffe..a158cc9b7c119b 100644
--- a/datahub-web-react/src/app/entity/shared/utils.ts
+++ b/datahub-web-react/src/app/entity/shared/utils.ts
@@ -1,9 +1,7 @@
-import * as QueryString from 'query-string';
import { Maybe } from 'graphql/jsutils/Maybe';
-import { Entity, EntityType, MatchedField, EntityRelationshipsResult, DataProduct } from '../../../types.generated';
+import { Entity, EntityType, EntityRelationshipsResult, DataProduct } from '../../../types.generated';
import { capitalizeFirstLetterOnly } from '../../shared/textUtil';
-import { FIELDS_TO_HIGHLIGHT } from '../dataset/search/highlights';
import { GenericEntityProperties } from './types';
 export function dictToQueryStringParams(params: Record<string, string | boolean>) {
@@ -87,46 +85,6 @@ export const isListSubset = (l1, l2): boolean => {
return l1.every((result) => l2.indexOf(result) >= 0);
};
-function normalize(value: string) {
- return value.trim().toLowerCase();
-}
-
-function fromQueryGetBestMatch(selectedMatchedFields: MatchedField[], rawQuery: string) {
- const query = normalize(rawQuery);
- // first lets see if there's an exact match between a field value and the query
- const exactMatch = selectedMatchedFields.find((field) => normalize(field.value) === query);
- if (exactMatch) {
- return exactMatch;
- }
-
- // if no exact match exists, we'll see if the entire query is contained in any of the values
- const containedMatch = selectedMatchedFields.find((field) => normalize(field.value).includes(query));
- if (containedMatch) {
- return containedMatch;
- }
-
- // otherwise, just return whichever is first
- return selectedMatchedFields[0];
-}
-
-export const getMatchPrioritizingPrimary = (
- matchedFields: MatchedField[],
- primaryField: string,
-): MatchedField | undefined => {
- const { location } = window;
- const params = QueryString.parse(location.search, { arrayFormat: 'comma' });
- const query: string = decodeURIComponent(params.query ? (params.query as string) : '');
-
- const primaryMatches = matchedFields.filter((field) => field.name === primaryField);
- if (primaryMatches.length > 0) {
- return fromQueryGetBestMatch(primaryMatches, query);
- }
-
- const matchesThatShouldBeShownOnFE = matchedFields.filter((field) => FIELDS_TO_HIGHLIGHT.has(field.name));
-
- return fromQueryGetBestMatch(matchesThatShouldBeShownOnFE, query);
-};
-
function getGraphqlErrorCode(e) {
if (e.graphQLErrors && e.graphQLErrors.length) {
const firstError = e.graphQLErrors[0];
diff --git a/datahub-web-react/src/app/entity/user/preview/Preview.tsx b/datahub-web-react/src/app/entity/user/preview/Preview.tsx
index 01f68d9065523a..8893d4ab867865 100644
--- a/datahub-web-react/src/app/entity/user/preview/Preview.tsx
+++ b/datahub-web-react/src/app/entity/user/preview/Preview.tsx
@@ -7,6 +7,7 @@ import { useEntityRegistry } from '../../../useEntityRegistry';
import { ANTD_GRAY } from '../../shared/constants';
import { IconStyleType } from '../../Entity';
import { CustomAvatar } from '../../../shared/avatar';
+import SearchTextHighlighter from '../../../search/matches/SearchTextHighlighter';
const PreviewContainer = styled.div`
display: flex;
@@ -80,11 +81,17 @@ export const Preview = ({
{entityRegistry.getEntityName(EntityType.CorpUser)}
- {name || urn}
+
+ {name ? <SearchTextHighlighter field="name" text={name} /> : urn}
+
- {title && {title} }
+ {title && (
+     <SearchTextHighlighter field="title" text={title} />
+ )}
diff --git a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx
index 03689460eb02bf..eda9b7d7fe2a45 100644
--- a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx
+++ b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx
@@ -1,4 +1,4 @@
-import React, { useEffect, useRef, useState } from 'react';
+import React, { CSSProperties, useEffect, useRef, useState } from 'react';
import { useHistory } from 'react-router';
import { Select } from 'antd';
import styled from 'styled-components';
@@ -55,11 +55,21 @@ const ViewSelectContainer = styled.div`
.ant-select-selection-item {
font-weight: 700;
font-size: 14px;
+ text-align: left;
}
}
}
`;
+const SelectStyled = styled(Select)`
+ min-width: 90px;
+ max-width: 200px;
+`;
+
+type Props = {
+ dropdownStyle?: CSSProperties;
+};
+
/**
* The View Select component allows you to select a View to apply to query on the current page. For example,
* search, recommendations, and browse.
@@ -69,7 +79,7 @@ const ViewSelectContainer = styled.div`
*
* In the event that a user refreshes their browser, the state of the view should be saved as well.
*/
-export const ViewSelect = () => {
+export const ViewSelect = ({ dropdownStyle = {} }: Props) => {
const history = useHistory();
const userContext = useUserContext();
const [isOpen, setIsOpen] = useState(false);
@@ -188,12 +198,11 @@ export const ViewSelect = () => {
return (
- (selectRef?.current as any)?.blur()}
value={(foundSelectedUrn && selectedUrn) || undefined}
- placeholder="All Entities"
+ placeholder="View all"
onSelect={onSelectView}
onClear={onClear}
ref={selectRef}
@@ -202,8 +211,8 @@ export const ViewSelect = () => {
dropdownMatchSelectWidth={false}
suffixIcon={ }
dropdownStyle={{
- position: 'fixed',
paddingBottom: 0,
+ ...dropdownStyle,
}}
onDropdownVisibleChange={handleDropdownVisibleChange}
dropdownRender={(menu) => (
@@ -237,7 +246,7 @@ export const ViewSelect = () => {
onClickEditView,
onClickPreviewView,
})}
-
+
{viewBuilderDisplayState.visible && (
{
ref={clearButtonRef}
onClick={onHandleClickClear}
>
- All Entities
+ View all
);
diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
index 36713cfb7ffcfa..0d0a32f7750a88 100644
--- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
+++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
@@ -34,6 +34,7 @@ import ExternalUrlButton from '../entity/shared/ExternalUrlButton';
import EntityPaths from './EntityPaths/EntityPaths';
import { DataProductLink } from '../shared/tags/DataProductLink';
import { EntityHealth } from '../entity/shared/containers/profile/header/EntityHealth';
+import SearchTextHighlighter from '../search/matches/SearchTextHighlighter';
import { getUniqueOwners } from './utils';
const PreviewContainer = styled.div`
@@ -173,6 +174,7 @@ interface Props {
deprecation?: Deprecation | null;
topUsers?: Array | null;
externalUrl?: string | null;
+ entityTitleSuffix?: React.ReactNode;
subHeader?: React.ReactNode;
snippet?: React.ReactNode;
insights?: Array | null;
@@ -225,6 +227,7 @@ export default function DefaultPreviewCard({
titleSizePx,
dataTestID,
externalUrl,
+ entityTitleSuffix,
onClick,
degree,
parentContainers,
@@ -289,7 +292,7 @@ export default function DefaultPreviewCard({
) : (
- {name || ' '}
+ <SearchTextHighlighter field="name" text={name || ' '} />
)}
@@ -305,6 +308,7 @@ export default function DefaultPreviewCard({
entityType={type}
/>
)}
+ {entityTitleSuffix}
{degree !== undefined && degree !== null && (
) : undefined
}
+ customRender={(text) => <SearchTextHighlighter field="description" text={text} />}
>
{description}
diff --git a/datahub-web-react/src/app/search/EmptySearchResults.tsx b/datahub-web-react/src/app/search/EmptySearchResults.tsx
new file mode 100644
index 00000000000000..cde61f746d35bd
--- /dev/null
+++ b/datahub-web-react/src/app/search/EmptySearchResults.tsx
@@ -0,0 +1,90 @@
+import { RocketOutlined } from '@ant-design/icons';
+import { useHistory } from 'react-router';
+import { Button } from 'antd';
+import React, { useCallback } from 'react';
+import styled from 'styled-components';
+import { ANTD_GRAY_V2 } from '../entity/shared/constants';
+import { navigateToSearchUrl } from './utils/navigateToSearchUrl';
+import analytics, { EventType } from '../analytics';
+import { SuggestedText } from './suggestions/SearchQuerySugggester';
+import useGetSearchQueryInputs from './useGetSearchQueryInputs';
+import { FacetFilterInput, SearchSuggestion } from '../../types.generated';
+import { useUserContext } from '../context/useUserContext';
+
+const NoDataContainer = styled.div`
+ margin: 40px auto;
+ font-size: 16px;
+ color: ${ANTD_GRAY_V2[8]};
+`;
+
+const Section = styled.div`
+ margin-bottom: 16px;
+`;
+
+function getRefineSearchText(filters: FacetFilterInput[], viewUrn?: string | null) {
+ let text = '';
+ if (filters.length && viewUrn) {
+ text = 'clearing all filters and selected view';
+ } else if (filters.length) {
+ text = 'clearing all filters';
+ } else if (viewUrn) {
+ text = 'clearing the selected view';
+ }
+
+ return text;
+}
+
+interface Props {
+ suggestions: SearchSuggestion[];
+}
+
+export default function EmptySearchResults({ suggestions }: Props) {
+ const { query, filters, viewUrn } = useGetSearchQueryInputs();
+ const history = useHistory();
+ const userContext = useUserContext();
+ const suggestText = suggestions.length > 0 ? suggestions[0].text : '';
+ const refineSearchText = getRefineSearchText(filters, viewUrn);
+
+ const onClickExploreAll = useCallback(() => {
+ analytics.event({ type: EventType.SearchResultsExploreAllClickEvent });
+ navigateToSearchUrl({ query: '*', history });
+ }, [history]);
+
+ const searchForSuggestion = () => {
+ navigateToSearchUrl({ query: suggestText, history });
+ };
+
+ const clearFiltersAndView = () => {
+ navigateToSearchUrl({ query, history });
+ userContext.updateLocalState({
+ ...userContext.localState,
+ selectedViewUrn: undefined,
+ });
+ };
+
+ return (
+        <NoDataContainer>
+            <Section>No results found for "{query}"</Section>
+            {refineSearchText && (
+                <>
+                    Try <SuggestedText onClick={clearFiltersAndView}>{refineSearchText}</SuggestedText>{' '}
+                    {suggestText && (
+                        <>
+                            or searching for <SuggestedText onClick={searchForSuggestion}>{suggestText}</SuggestedText>
+                        </>
+                    )}
+                </>
+            )}
+            {!refineSearchText && suggestText && (
+                <>
+                    Did you mean <SuggestedText onClick={searchForSuggestion}>{suggestText}</SuggestedText>
+                </>
+            )}
+            {!refineSearchText && !suggestText && (
+                <Button onClick={onClickExploreAll}>
+                    <RocketOutlined /> Explore all
+                </Button>
+            )}
+        </NoDataContainer>
+ );
+}
diff --git a/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx b/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx
deleted file mode 100644
index 9b577048145c5e..00000000000000
--- a/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx
+++ /dev/null
@@ -1,98 +0,0 @@
-import { ArrowRightOutlined } from '@ant-design/icons';
-import { Button, Card, Divider, List, Space, Typography } from 'antd';
-import { ListProps } from 'antd/lib/list';
-import * as React from 'react';
-import { useHistory } from 'react-router-dom';
-import styled from 'styled-components';
-import { EntityType, SearchResult } from '../../types.generated';
-import { IconStyleType } from '../entity/Entity';
-import { useEntityRegistry } from '../useEntityRegistry';
-import { navigateToSearchUrl } from './utils/navigateToSearchUrl';
-import analytics, { EventType } from '../analytics';
-
-const styles = {
- header: { marginBottom: 20 },
- resultHeaderCardBody: { padding: '16px 24px' },
- resultHeaderCard: { right: '52px', top: '-40px', position: 'absolute' },
- seeAllButton: { fontSize: 18 },
- resultsContainer: { width: '100%', padding: '40px 132px' },
-};
-
-const ResultList = styled(List)`
- &&& {
- width: 100%;
- border-color: ${(props) => props.theme.styles['border-color-base']};
- margin-top: 8px;
- padding: 16px 48px;
- box-shadow: ${(props) => props.theme.styles['box-shadow']};
- }
-`;
-
-interface Props {
- type: EntityType;
- query: string;
- searchResults: Array;
-}
-
-export const EntityGroupSearchResults = ({ type, query, searchResults }: Props) => {
- const history = useHistory();
- const entityRegistry = useEntityRegistry();
-
- const onResultClick = (result: SearchResult, index: number) => {
- analytics.event({
- type: EventType.SearchResultClickEvent,
- query,
- entityUrn: result.entity.urn,
- entityType: result.entity.type,
- index,
- total: searchResults.length,
- });
- };
-
- return (
-
- >>
- header={
-
- {entityRegistry.getCollectionName(type)}
-
- {entityRegistry.getIcon(type, 36, IconStyleType.ACCENT)}
-
-
- }
- footer={
- searchResults.length > 0 && (
-
- navigateToSearchUrl({
- type,
- query,
- page: 0,
- history,
- })
- }
- >
-
- See all {entityRegistry.getCollectionName(type)} results
-
-
-
- )
- }
- dataSource={searchResults as SearchResult[]}
- split={false}
- renderItem={(searchResult, index) => (
- <>
- onResultClick(searchResult, index)}>
- {entityRegistry.renderSearchResult(type, searchResult)}
-
- {index < searchResults.length - 1 && }
- >
- )}
- bordered
- />
-
- );
-};
diff --git a/datahub-web-react/src/app/search/PostLinkCard.tsx b/datahub-web-react/src/app/search/PostLinkCard.tsx
index 04308632c61c96..2111c0b25ad848 100644
--- a/datahub-web-react/src/app/search/PostLinkCard.tsx
+++ b/datahub-web-react/src/app/search/PostLinkCard.tsx
@@ -39,12 +39,17 @@ const TextContainer = styled.div`
flex: 2;
`;
-const TextWrapper = styled.div`
- text-align: left;
+const FlexWrapper = styled.div<{ alignCenter?: boolean }>`
display: flex;
flex-direction: column;
justify-content: center;
flex: 2;
+ ${(props) => props.alignCenter && 'align-items: center;'}
+`;
+
+const TextWrapper = styled.div`
+ display: flex;
+ flex-direction: column;
`;
const HeaderText = styled(Typography.Text)`
@@ -74,19 +79,21 @@ export const PostLinkCard = ({ linkPost }: Props) => {
const link = linkPost?.content?.link || '';
return (
-
+
{hasMedia && (
)}
-
- Link
-
- {linkPost?.content?.title}
-
-
+
+
+ Link
+
+ {linkPost?.content?.title}
+
+
+
diff --git a/datahub-web-react/src/app/search/PostTextCard.tsx b/datahub-web-react/src/app/search/PostTextCard.tsx
index 1bba55425fe0d0..15b34e37fc01cb 100644
--- a/datahub-web-react/src/app/search/PostTextCard.tsx
+++ b/datahub-web-react/src/app/search/PostTextCard.tsx
@@ -7,7 +7,6 @@ import { Post } from '../../types.generated';
const CardContainer = styled.div`
display: flex;
flex-direction: row;
- min-height: 140px;
border: 1px solid ${ANTD_GRAY[4]};
border-radius: 12px;
box-shadow: ${(props) => props.theme.styles['box-shadow']};
@@ -15,6 +14,7 @@ const CardContainer = styled.div`
box-shadow: ${(props) => props.theme.styles['box-shadow-hover']};
}
white-space: unset;
+ padding-bottom: 4px;
`;
const TextContainer = styled.div`
@@ -28,6 +28,9 @@ const TextContainer = styled.div`
const TitleText = styled(Typography.Title)`
word-break: break-word;
min-height: 20px;
+ &&& {
+ margin-top: 8px;
+ }
`;
const HeaderText = styled(Typography.Text)`
diff --git a/datahub-web-react/src/app/search/SearchBar.tsx b/datahub-web-react/src/app/search/SearchBar.tsx
index 7dbf3c55d021d3..fb10e1ca0026eb 100644
--- a/datahub-web-react/src/app/search/SearchBar.tsx
+++ b/datahub-web-react/src/app/search/SearchBar.tsx
@@ -377,7 +377,15 @@ export const SearchBar = ({
onKeyUp={handleStopPropagation}
onKeyDown={handleStopPropagation}
>
-
+
)}
{
orFilters,
viewUrn,
sortInput,
+ searchFlags: { getSuggestions: true },
},
},
});
@@ -235,6 +236,7 @@ export const SearchPage = () => {
error={error}
searchResponse={data?.searchAcrossEntities}
facets={data?.searchAcrossEntities?.facets}
+ suggestions={data?.searchAcrossEntities?.suggestions || []}
selectedFilters={filters}
loading={loading}
onChangeFilters={onChangeFilters}
diff --git a/datahub-web-react/src/app/search/SearchResultList.tsx b/datahub-web-react/src/app/search/SearchResultList.tsx
index 6e2d5c923c6e27..386b22f34602b1 100644
--- a/datahub-web-react/src/app/search/SearchResultList.tsx
+++ b/datahub-web-react/src/app/search/SearchResultList.tsx
@@ -1,18 +1,16 @@
-import React, { useCallback } from 'react';
-import { Button, Checkbox, Divider, Empty, List, ListProps } from 'antd';
+import React from 'react';
+import { Checkbox, Divider, List, ListProps } from 'antd';
import styled from 'styled-components';
-import { useHistory } from 'react-router';
-import { RocketOutlined } from '@ant-design/icons';
-import { navigateToSearchUrl } from './utils/navigateToSearchUrl';
import { ANTD_GRAY } from '../entity/shared/constants';
import { SEPARATE_SIBLINGS_URL_PARAM } from '../entity/shared/siblingUtils';
import { CompactEntityNameList } from '../recommendations/renderer/component/CompactEntityNameList';
import { useEntityRegistry } from '../useEntityRegistry';
-import { SearchResult } from '../../types.generated';
+import { SearchResult, SearchSuggestion } from '../../types.generated';
import analytics, { EventType } from '../analytics';
import { EntityAndType } from '../entity/shared/types';
import { useIsSearchV2 } from './useSearchAndBrowseVersion';
import { CombinedSearchResult } from './utils/combineSiblingsInSearchResults';
+import EmptySearchResults from './EmptySearchResults';
const ResultList = styled(List)`
&&& {
@@ -28,13 +26,6 @@ const StyledCheckbox = styled(Checkbox)`
margin-right: 12px;
`;
-const NoDataContainer = styled.div`
- > div {
- margin-top: 28px;
- margin-bottom: 28px;
- }
-`;
-
const ThinDivider = styled(Divider)`
margin-top: 16px;
margin-bottom: 16px;
@@ -70,6 +61,7 @@ type Props = {
isSelectMode: boolean;
selectedEntities: EntityAndType[];
setSelectedEntities: (entities: EntityAndType[]) => any;
+ suggestions: SearchSuggestion[];
};
export const SearchResultList = ({
@@ -79,17 +71,12 @@ export const SearchResultList = ({
isSelectMode,
selectedEntities,
setSelectedEntities,
+ suggestions,
}: Props) => {
- const history = useHistory();
const entityRegistry = useEntityRegistry();
const selectedEntityUrns = selectedEntities.map((entity) => entity.urn);
const showSearchFiltersV2 = useIsSearchV2();
- const onClickExploreAll = useCallback(() => {
- analytics.event({ type: EventType.SearchResultsExploreAllClickEvent });
- navigateToSearchUrl({ query: '*', history });
- }, [history]);
-
const onClickResult = (result: SearchResult, index: number) => {
analytics.event({
type: EventType.SearchResultClickEvent,
@@ -118,19 +105,7 @@ export const SearchResultList = ({
id="search-result-list"
dataSource={searchResults}
split={false}
- locale={{
- emptyText: (
-
-
-
- Explore all
-
-
- ),
- }}
+            locale={{ emptyText: <EmptySearchResults suggestions={suggestions} /> }}
renderItem={(item, index) => (
`
display: flex;
@@ -131,6 +132,7 @@ interface Props {
setNumResultsPerPage: (numResults: number) => void;
isSelectMode: boolean;
selectedEntities: EntityAndType[];
+ suggestions: SearchSuggestion[];
setSelectedEntities: (entities: EntityAndType[]) => void;
setIsSelectMode: (showSelectMode: boolean) => any;
onChangeSelectAll: (selected: boolean) => void;
@@ -155,6 +157,7 @@ export const SearchResults = ({
setNumResultsPerPage,
isSelectMode,
selectedEntities,
+ suggestions,
setIsSelectMode,
setSelectedEntities,
onChangeSelectAll,
@@ -238,6 +241,7 @@ export const SearchResults = ({
{(error && ) ||
(!loading && (
+ {totalResults > 0 && }
-
+ {totalResults > 0 && (
+
+ )}
{authenticatedUserUrn && (
void;
};
export const DEFAULT_CONTEXT = {
+ query: undefined,
selectedSortOption: undefined,
setSelectedSortOption: (_: string) => null,
};
@@ -21,3 +23,7 @@ export function useSearchContext() {
export function useSelectedSortOption() {
return useSearchContext().selectedSortOption;
}
+
+export function useSearchQuery() {
+ return useSearchContext().query;
+}
diff --git a/datahub-web-react/src/app/search/context/SearchContextProvider.tsx b/datahub-web-react/src/app/search/context/SearchContextProvider.tsx
index bfb65c1d74d3ee..5ad9667ab1fc09 100644
--- a/datahub-web-react/src/app/search/context/SearchContextProvider.tsx
+++ b/datahub-web-react/src/app/search/context/SearchContextProvider.tsx
@@ -8,6 +8,7 @@ export default function SearchContextProvider({ children }: { children: React.Re
const history = useHistory();
const location = useLocation();
const params = useMemo(() => QueryString.parse(location.search, { arrayFormat: 'comma' }), [location.search]);
+ const query = (params.query ? decodeURIComponent(params.query as string) : undefined) as string | undefined;
const selectedSortOption = params.sortOption as string | undefined;
function setSelectedSortOption(selectedOption: string) {
@@ -15,7 +16,7 @@ export default function SearchContextProvider({ children }: { children: React.Re
}
return (
-        <SearchContext.Provider value={{ selectedSortOption, setSelectedSortOption }}>
+        <SearchContext.Provider value={{ query, selectedSortOption, setSelectedSortOption }}>
{children}
);
diff --git a/datahub-web-react/src/app/search/context/SearchResultContext.tsx b/datahub-web-react/src/app/search/context/SearchResultContext.tsx
new file mode 100644
index 00000000000000..68adead0051492
--- /dev/null
+++ b/datahub-web-react/src/app/search/context/SearchResultContext.tsx
@@ -0,0 +1,72 @@
+import React, { ReactNode, createContext, useContext, useMemo } from 'react';
+import { SearchResult } from '../../../types.generated';
+import {
+ getMatchedFieldsByUrn,
+ getMatchedFieldNames,
+ getMatchedFieldsByNames,
+ shouldShowInMatchedFieldList,
+ getMatchedFieldLabel,
+ getMatchesPrioritized,
+} from '../matches/utils';
+import { MatchedFieldName } from '../matches/constants';
+
+type SearchResultContextValue = {
+ searchResult: SearchResult;
+} | null;
+
+const SearchResultContext = createContext<SearchResultContextValue>(null);
+
+type Props = {
+ children: ReactNode;
+ searchResult: SearchResult;
+};
+
+export const SearchResultProvider = ({ children, searchResult }: Props) => {
+ const value = useMemo(
+ () => ({
+ searchResult,
+ }),
+ [searchResult],
+ );
+    return <SearchResultContext.Provider value={value}>{children}</SearchResultContext.Provider>;
+};
+
+const useSearchResultContext = () => {
+ return useContext(SearchResultContext);
+};
+
+export const useSearchResult = () => {
+ return useSearchResultContext()?.searchResult;
+};
+
+export const useEntityType = () => {
+ return useSearchResultContext()?.searchResult.entity.type;
+};
+
+export const useMatchedFields = () => {
+ return useSearchResult()?.matchedFields ?? [];
+};
+
+export const useMatchedFieldsForList = (primaryField: MatchedFieldName) => {
+ const entityType = useEntityType();
+ const matchedFields = useMatchedFields();
+ const showableFields = matchedFields.filter((field) => shouldShowInMatchedFieldList(entityType, field));
+ return entityType ? getMatchesPrioritized(entityType, showableFields, primaryField) : [];
+};
+
+export const useMatchedFieldsByGroup = (fieldName: MatchedFieldName) => {
+ const entityType = useEntityType();
+ const matchedFields = useMatchedFields();
+ const matchedFieldNames = getMatchedFieldNames(entityType, fieldName);
+ return getMatchedFieldsByNames(matchedFields, matchedFieldNames);
+};
+
+export const useHasMatchedFieldByUrn = (urn: string, fieldName: MatchedFieldName) => {
+ const matchedFields = useMatchedFieldsByGroup(fieldName);
+ return getMatchedFieldsByUrn(matchedFields, urn).length > 0;
+};
+
+export const useMatchedFieldLabel = (fieldName: string) => {
+ const entityType = useEntityType();
+ return getMatchedFieldLabel(entityType, fieldName);
+};
diff --git a/datahub-web-react/src/app/search/context/constants.ts b/datahub-web-react/src/app/search/context/constants.ts
index 372230db023e9d..5f841b8536e196 100644
--- a/datahub-web-react/src/app/search/context/constants.ts
+++ b/datahub-web-react/src/app/search/context/constants.ts
@@ -1,15 +1,23 @@
import { SortOrder } from '../../../types.generated';
export const RELEVANCE = 'relevance';
-export const NAME_FIELD = 'name';
+export const ENTITY_NAME_FIELD = '_entityName';
export const LAST_OPERATION_TIME_FIELD = 'lastOperationTime';
export const DEFAULT_SORT_OPTION = RELEVANCE;
export const SORT_OPTIONS = {
[RELEVANCE]: { label: 'Relevance', field: RELEVANCE, sortOrder: SortOrder.Descending },
- [`${NAME_FIELD}_${SortOrder.Ascending}`]: { label: 'A to Z', field: NAME_FIELD, sortOrder: SortOrder.Ascending },
- [`${NAME_FIELD}_${SortOrder.Descending}`]: { label: 'Z to A', field: NAME_FIELD, sortOrder: SortOrder.Descending },
+ [`${ENTITY_NAME_FIELD}_${SortOrder.Ascending}`]: {
+ label: 'A to Z',
+ field: ENTITY_NAME_FIELD,
+ sortOrder: SortOrder.Ascending,
+ },
+ [`${ENTITY_NAME_FIELD}_${SortOrder.Descending}`]: {
+ label: 'Z to A',
+ field: ENTITY_NAME_FIELD,
+ sortOrder: SortOrder.Descending,
+ },
[`${LAST_OPERATION_TIME_FIELD}_${SortOrder.Descending}`]: {
label: 'Last Modified in Platform',
field: LAST_OPERATION_TIME_FIELD,
diff --git a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx
new file mode 100644
index 00000000000000..0bfe000dea3663
--- /dev/null
+++ b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx
@@ -0,0 +1,133 @@
+import React from 'react';
+
+import { Tooltip, Typography } from 'antd';
+import styled from 'styled-components';
+import { useMatchedFieldLabel, useMatchedFieldsForList } from '../context/SearchResultContext';
+import { MatchedField } from '../../../types.generated';
+import { ANTD_GRAY_V2 } from '../../entity/shared/constants';
+import { useSearchQuery } from '../context/SearchContext';
+import { MatchesGroupedByFieldName } from './constants';
+import { useEntityRegistry } from '../../useEntityRegistry';
+import { getDescriptionSlice, isDescriptionField, isHighlightableEntityField } from './utils';
+
+const MatchesContainer = styled.div`
+ display: flex;
+ flex-wrap: wrap;
+ gap: 8px;
+`;
+
+const MatchText = styled(Typography.Text)`
+ color: ${ANTD_GRAY_V2[8]};
+ background: ${(props) => props.theme.styles['highlight-color']};
+ border-radius: 4px;
+ padding: 2px 4px 2px 4px;
+ padding-right: 4px;
+`;
+
+const MATCH_GROUP_LIMIT = 3;
+const TOOLTIP_MATCH_GROUP_LIMIT = 10;
+
+type CustomFieldRenderer = (field: MatchedField) => JSX.Element | null;
+
+type Props = {
+ customFieldRenderer?: CustomFieldRenderer;
+ matchSuffix?: string;
+};
+
+const RenderedField = ({
+ customFieldRenderer,
+ field,
+}: {
+ customFieldRenderer?: CustomFieldRenderer;
+ field: MatchedField;
+}) => {
+ const entityRegistry = useEntityRegistry();
+ const query = useSearchQuery()?.trim().toLowerCase();
+ const customRenderedField = customFieldRenderer?.(field);
+    if (customRenderedField) return <b>{customRenderedField}</b>;
+    if (isHighlightableEntityField(field)) {
+        return field.entity ? <>{entityRegistry.getDisplayName(field.entity.type, field.entity)}</> : <></>;
+    }
+    if (isDescriptionField(field) && query) return <b>{getDescriptionSlice(field.value, query)}</b>;
+    return <b>{field.value}</b>;
+};
+
+const MatchedFieldsList = ({
+ groupedMatch,
+ limit,
+ tooltip,
+ matchSuffix = '',
+ customFieldRenderer,
+}: {
+ groupedMatch: MatchesGroupedByFieldName;
+ limit: number;
+ tooltip?: JSX.Element;
+ matchSuffix?: string;
+ customFieldRenderer?: CustomFieldRenderer;
+}) => {
+ const label = useMatchedFieldLabel(groupedMatch.fieldName);
+ const count = groupedMatch.matchedFields.length;
+ const moreCount = Math.max(count - limit, 0);
+ const andMore = (
+ <>
+ {' '}
+ & more
+        </>
+ );
+ return (
+ <>
+ Matches {count > 1 && `${count} `}
+ {label}
+ {count > 1 && 's'}{' '}
+            {groupedMatch.matchedFields.slice(0, limit).map((field, index) => (
+                <>
+                    {index > 0 && ', '}
+                    <>
+                        <RenderedField field={field} customFieldRenderer={customFieldRenderer} />
+                    </>
+                </>
+            ))}
+            {moreCount > 0 &&
+                (tooltip ? (
+                    <Tooltip title={tooltip}>
+                        {andMore}
+                    </Tooltip>
+                ) : (
+                    <>{andMore}</>
+                ))}{' '}
+            {matchSuffix}
+        </>
+ );
+};
+
+export const MatchedFieldList = ({ customFieldRenderer, matchSuffix = '' }: Props) => {
+ const groupedMatches = useMatchedFieldsForList('fieldLabels');
+
+ return (
+ <>
+ {groupedMatches.length > 0 ? (
+                <MatchesContainer>
+                    {groupedMatches.map((groupedMatch) => {
+                        return (
+                            <MatchText key={groupedMatch.fieldName}>
+                                <MatchedFieldsList
+                                    groupedMatch={groupedMatch}
+                                    limit={MATCH_GROUP_LIMIT}
+                                    customFieldRenderer={customFieldRenderer}
+                                    matchSuffix={matchSuffix}
+                                    tooltip={
+                                        <MatchedFieldsList
+                                            groupedMatch={groupedMatch}
+                                            limit={TOOLTIP_MATCH_GROUP_LIMIT}
+                                            customFieldRenderer={customFieldRenderer}
+                                            matchSuffix={matchSuffix}
+                                        />
+                                    }
+                                />
+                            </MatchText>
+                        );
+                    })}
+                </MatchesContainer>
+            ) : null}
+        </>
+ );
+};
diff --git a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx
new file mode 100644
index 00000000000000..d8da1088ea89d1
--- /dev/null
+++ b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx
@@ -0,0 +1,42 @@
+import React from 'react';
+import Highlight from 'react-highlighter';
+import styled from 'styled-components';
+import { useMatchedFieldsByGroup } from '../context/SearchResultContext';
+import { useSearchQuery } from '../context/SearchContext';
+import { MatchedFieldName } from './constants';
+import { useAppConfig } from '../../useAppConfig';
+
+type Props = {
+ field: MatchedFieldName;
+ text: string;
+ enableFullHighlight?: boolean;
+};
+
+const HIGHLIGHT_ALL_PATTERN = /.*/;
+
+const StyledHighlight = styled(Highlight).attrs((props) => ({
+ matchStyle: { background: props.theme.styles['highlight-color'] },
+}))``;
+
+const SearchTextHighlighter = ({ field, text, enableFullHighlight = false }: Props) => {
+ const appConfig = useAppConfig();
+ const enableNameHighlight = appConfig.config.visualConfig.searchResult?.enableNameHighlight;
+ const matchedFields = useMatchedFieldsByGroup(field);
+ const hasMatchedField = !!matchedFields?.length;
+ const normalizedSearchQuery = useSearchQuery()?.trim().toLowerCase();
+ const normalizedText = text.trim().toLowerCase();
+ const hasSubstring = hasMatchedField && !!normalizedSearchQuery && normalizedText.includes(normalizedSearchQuery);
+ const pattern = enableFullHighlight ? HIGHLIGHT_ALL_PATTERN : undefined;
+
+ return (
+ <>
+ {enableNameHighlight && hasMatchedField ? (
+                <StyledHighlight search={hasSubstring ? normalizedSearchQuery : pattern}>{text}</StyledHighlight>
+            ) : (
+                text
+            )}
+        </>
+ );
+};
+
+export default SearchTextHighlighter;
diff --git a/datahub-web-react/src/app/search/matches/constants.ts b/datahub-web-react/src/app/search/matches/constants.ts
new file mode 100644
index 00000000000000..25ca82eef95978
--- /dev/null
+++ b/datahub-web-react/src/app/search/matches/constants.ts
@@ -0,0 +1,129 @@
+import { EntityType, MatchedField } from '../../../types.generated';
+
+export type MatchedFieldName =
+ | 'urn'
+ | 'name'
+ | 'displayName'
+ | 'title'
+ | 'description'
+ | 'editedDescription'
+ | 'editedFieldDescriptions'
+ | 'fieldDescriptions'
+ | 'tags'
+ | 'fieldTags'
+ | 'editedFieldTags'
+ | 'glossaryTerms'
+ | 'fieldGlossaryTerms'
+ | 'editedFieldGlossaryTerms'
+ | 'fieldLabels'
+ | 'fieldPaths';
+
+export type MatchedFieldConfig = {
+ name: MatchedFieldName;
+ groupInto?: MatchedFieldName;
+ label: string;
+ showInMatchedFieldList?: boolean;
+};
+
+const DEFAULT_MATCHED_FIELD_CONFIG: Array<MatchedFieldConfig> = [
+ {
+ name: 'urn',
+ label: 'urn',
+ },
+ {
+ name: 'title',
+ label: 'title',
+ },
+ {
+ name: 'displayName',
+ groupInto: 'name',
+ label: 'display name',
+ },
+ {
+ name: 'name',
+ groupInto: 'name',
+ label: 'name',
+ },
+ {
+ name: 'editedDescription',
+ groupInto: 'description',
+ label: 'description',
+ },
+ {
+ name: 'description',
+ groupInto: 'description',
+ label: 'description',
+ },
+ {
+ name: 'editedFieldDescriptions',
+ groupInto: 'fieldDescriptions',
+ label: 'column description',
+ showInMatchedFieldList: true,
+ },
+ {
+ name: 'fieldDescriptions',
+ groupInto: 'fieldDescriptions',
+ label: 'column description',
+ showInMatchedFieldList: true,
+ },
+ {
+ name: 'tags',
+ label: 'tag',
+ },
+ {
+ name: 'editedFieldTags',
+ groupInto: 'fieldTags',
+ label: 'column tag',
+ showInMatchedFieldList: true,
+ },
+ {
+ name: 'fieldTags',
+ groupInto: 'fieldTags',
+ label: 'column tag',
+ showInMatchedFieldList: true,
+ },
+ {
+ name: 'glossaryTerms',
+ label: 'term',
+ },
+ {
+ name: 'editedFieldGlossaryTerms',
+ groupInto: 'fieldGlossaryTerms',
+ label: 'column term',
+ showInMatchedFieldList: true,
+ },
+ {
+ name: 'fieldGlossaryTerms',
+ groupInto: 'fieldGlossaryTerms',
+ label: 'column term',
+ showInMatchedFieldList: true,
+ },
+ {
+ name: 'fieldLabels',
+ label: 'label',
+ showInMatchedFieldList: true,
+ },
+ {
+ name: 'fieldPaths',
+ label: 'column',
+ showInMatchedFieldList: true,
+ },
+];
+
+export const CHART_DASHBOARD_FIELD_CONFIG: Array<MatchedFieldConfig> = DEFAULT_MATCHED_FIELD_CONFIG.map((config) => {
+ if (config.name === 'title') return { ...config, groupInto: 'name' };
+ return config;
+});
+
+export const MATCHED_FIELD_CONFIG = {
+ [EntityType.Chart]: CHART_DASHBOARD_FIELD_CONFIG,
+ [EntityType.Dashboard]: CHART_DASHBOARD_FIELD_CONFIG,
+ DEFAULT: DEFAULT_MATCHED_FIELD_CONFIG,
+} as const;
+
+export type MatchesGroupedByFieldName = {
+ fieldName: string;
+    matchedFields: Array<MatchedField>;
+};
+
+export const HIGHLIGHTABLE_ENTITY_TYPES = [EntityType.Tag, EntityType.GlossaryTerm];
diff --git a/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx b/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx
new file mode 100644
index 00000000000000..0a33530552864f
--- /dev/null
+++ b/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx
@@ -0,0 +1,8 @@
+import React from 'react';
+
+import { MatchedField } from '../../../types.generated';
+import { downgradeV2FieldPath } from '../../entity/dataset/profile/schema/utils/utils';
+
+export const matchedFieldPathsRenderer = (matchedField: MatchedField) => {
+    return matchedField?.name === 'fieldPaths' ? <b>{downgradeV2FieldPath(matchedField.value)}</b> : null;
+};
diff --git a/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx b/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx
new file mode 100644
index 00000000000000..25634c9e8b80e7
--- /dev/null
+++ b/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx
@@ -0,0 +1,40 @@
+import React from 'react';
+
+import { Chart, Dashboard, EntityType, GlossaryTerm, MatchedField } from '../../../types.generated';
+import { useEntityRegistry } from '../../useEntityRegistry';
+
+const LABEL_INDEX_NAME = 'fieldLabels';
+const TYPE_PROPERTY_KEY_NAME = 'type';
+
+const TermName = ({ term }: { term: GlossaryTerm }) => {
+ const entityRegistry = useEntityRegistry();
+ return <>{entityRegistry.getDisplayName(EntityType.GlossaryTerm, term)}>;
+};
+
+export const matchedInputFieldRenderer = (matchedField: MatchedField, entity: Chart | Dashboard) => {
+ if (matchedField?.name === LABEL_INDEX_NAME) {
+ const matchedSchemaField = entity.inputFields?.fields?.find(
+ (field) => field?.schemaField?.label === matchedField.value,
+ );
+ const matchedGlossaryTerm = matchedSchemaField?.schemaField?.glossaryTerms?.terms?.find(
+ (term) => term?.term?.name === matchedField.value,
+ );
+
+ if (matchedGlossaryTerm) {
+ let termType = 'term';
+ const typeProperty = matchedGlossaryTerm.term.properties?.customProperties?.find(
+ (property) => property.key === TYPE_PROPERTY_KEY_NAME,
+ );
+ if (typeProperty) {
+ termType = typeProperty.value || termType;
+ }
+
+ return (
+            <>
+                {termType} <TermName term={matchedGlossaryTerm.term} />
+            </>
+ );
+ }
+ }
+ return null;
+};
diff --git a/datahub-web-react/src/app/search/matches/utils.test.ts b/datahub-web-react/src/app/search/matches/utils.test.ts
new file mode 100644
index 00000000000000..8b5ed27f5c2ad6
--- /dev/null
+++ b/datahub-web-react/src/app/search/matches/utils.test.ts
@@ -0,0 +1,110 @@
+import { EntityType } from '../../../types.generated';
+import { getMatchesPrioritized } from './utils';
+
+const mapping = new Map();
+mapping.set('fieldPaths', 'column');
+mapping.set('fieldDescriptions', 'column description');
+mapping.set('fieldTags', 'column tag');
+
+const MOCK_MATCHED_FIELDS = [
+ {
+ name: 'fieldPaths',
+ value: 'rain',
+ },
+ {
+ name: 'fieldDescriptions',
+ value: 'rainbow',
+ },
+ {
+ name: 'fieldPaths',
+ value: 'rainbow',
+ },
+ {
+ name: 'fieldPaths',
+ value: 'rainbows',
+ },
+];
+
+const MOCK_MATCHED_DESCRIPTION_FIELDS = [
+ {
+ name: 'editedDescription',
+ value: 'edited description value',
+ },
+ {
+ name: 'description',
+ value: 'description value',
+ },
+ {
+ name: 'fieldDescriptions',
+ value: 'field descriptions value',
+ },
+ {
+ name: 'editedFieldDescriptions',
+ value: 'edited field descriptions value',
+ },
+];
+
+describe('utils', () => {
+ describe('getMatchPrioritizingPrimary', () => {
+ it('prioritizes exact match', () => {
+ global.window.location.search = 'query=rainbow';
+ const groupedMatches = getMatchesPrioritized(EntityType.Dataset, MOCK_MATCHED_FIELDS, 'fieldPaths');
+ expect(groupedMatches).toEqual([
+ {
+ fieldName: 'fieldPaths',
+ matchedFields: [
+ { name: 'fieldPaths', value: 'rainbow' },
+ { name: 'fieldPaths', value: 'rainbows' },
+ { name: 'fieldPaths', value: 'rain' },
+ ],
+ },
+ {
+ fieldName: 'fieldDescriptions',
+ matchedFields: [{ name: 'fieldDescriptions', value: 'rainbow' }],
+ },
+ ]);
+ });
+ it('will accept first contains match', () => {
+ global.window.location.search = 'query=bow';
+ const groupedMatches = getMatchesPrioritized(EntityType.Dataset, MOCK_MATCHED_FIELDS, 'fieldPaths');
+ expect(groupedMatches).toEqual([
+ {
+ fieldName: 'fieldPaths',
+ matchedFields: [
+ { name: 'fieldPaths', value: 'rainbow' },
+ { name: 'fieldPaths', value: 'rainbows' },
+ { name: 'fieldPaths', value: 'rain' },
+ ],
+ },
+ {
+ fieldName: 'fieldDescriptions',
+ matchedFields: [{ name: 'fieldDescriptions', value: 'rainbow' }],
+ },
+ ]);
+ });
+ it('will group by field name', () => {
+ global.window.location.search = '';
+ const groupedMatches = getMatchesPrioritized(
+ EntityType.Dataset,
+ MOCK_MATCHED_DESCRIPTION_FIELDS,
+ 'fieldPaths',
+ );
+ expect(groupedMatches).toEqual([
+ {
+ fieldName: 'description',
+ matchedFields: [
+ { name: 'editedDescription', value: 'edited description value' },
+ { name: 'description', value: 'description value' },
+ ],
+ },
+ {
+ fieldName: 'fieldDescriptions',
+ matchedFields: [
+ { name: 'fieldDescriptions', value: 'field descriptions value' },
+ { name: 'editedFieldDescriptions', value: 'edited field descriptions value' },
+ ],
+ },
+ ]);
+ });
+ });
+});
diff --git a/datahub-web-react/src/app/search/matches/utils.ts b/datahub-web-react/src/app/search/matches/utils.ts
new file mode 100644
index 00000000000000..78c62f7eef4588
--- /dev/null
+++ b/datahub-web-react/src/app/search/matches/utils.ts
@@ -0,0 +1,136 @@
+import * as QueryString from 'query-string';
+import { EntityType, MatchedField } from '../../../types.generated';
+import {
+ HIGHLIGHTABLE_ENTITY_TYPES,
+ MATCHED_FIELD_CONFIG,
+ MatchedFieldConfig,
+ MatchedFieldName,
+ MatchesGroupedByFieldName,
+} from './constants';
+
+const getFieldConfigsByEntityType = (entityType: EntityType | undefined): Array<MatchedFieldConfig> => {
+ return entityType && entityType in MATCHED_FIELD_CONFIG
+ ? MATCHED_FIELD_CONFIG[entityType]
+ : MATCHED_FIELD_CONFIG.DEFAULT;
+};
+
+export const shouldShowInMatchedFieldList = (entityType: EntityType | undefined, field: MatchedField): boolean => {
+ const configs = getFieldConfigsByEntityType(entityType);
+ return configs.some((config) => config.name === field.name && config.showInMatchedFieldList);
+};
+
+export const getMatchedFieldLabel = (entityType: EntityType | undefined, fieldName: string): string => {
+ const configs = getFieldConfigsByEntityType(entityType);
+ return configs.find((config) => config.name === fieldName)?.label ?? '';
+};
+
+export const getGroupedFieldName = (
+ entityType: EntityType | undefined,
+ fieldName: string,
+): MatchedFieldName | undefined => {
+ const configs = getFieldConfigsByEntityType(entityType);
+ const fieldConfig = configs.find((config) => config.name === fieldName);
+ return fieldConfig?.groupInto;
+};
+
+export const getMatchedFieldNames = (
+ entityType: EntityType | undefined,
+ fieldName: MatchedFieldName,
+): Array<MatchedFieldName> => {
+ return getFieldConfigsByEntityType(entityType)
+ .filter((config) => fieldName === config.groupInto || fieldName === config.name)
+ .map((field) => field.name);
+};
+
+export const getMatchedFieldsByNames = (fields: Array<MatchedField>, names: Array<string>): Array<MatchedField> => {
+ return fields.filter((field) => names.includes(field.name));
+};
+
+export const getMatchedFieldsByUrn = (fields: Array<MatchedField>, urn: string): Array<MatchedField> => {
+ return fields.filter((field) => field.value === urn);
+};
+
+function normalize(value: string) {
+ return value.trim().toLowerCase();
+}
+
+function fromQueryGetBestMatch(
+ selectedMatchedFields: MatchedField[],
+ rawQuery: string,
+ prioritizedField: string,
+): Array<MatchedField> {
+ const query = normalize(rawQuery);
+    const priorityMatches: Array<MatchedField> = selectedMatchedFields.filter(
+        (field) => field.name === prioritizedField,
+    );
+    const nonPriorityMatches: Array<MatchedField> = selectedMatchedFields.filter(
+        (field) => field.name !== prioritizedField,
+    );
+    const exactMatches: Array<MatchedField> = [];
+    const containedMatches: Array<MatchedField> = [];
+    const rest: Array<MatchedField> = [];
+
+ [...priorityMatches, ...nonPriorityMatches].forEach((field) => {
+ const normalizedValue = normalize(field.value);
+ if (normalizedValue === query) exactMatches.push(field);
+ else if (normalizedValue.includes(query)) containedMatches.push(field);
+ else rest.push(field);
+ });
+
+ return [...exactMatches, ...containedMatches, ...rest];
+}
+
+const getMatchesGroupedByFieldName = (
+    entityType: EntityType,
+    matchedFields: Array<MatchedField>,
+): Array<MatchesGroupedByFieldName> => {
+    const fieldNameToMatches = new Map<string, Array<MatchedField>>();
+    const fieldNames: Array<string> = [];
+ matchedFields.forEach((field) => {
+ const groupedFieldName = getGroupedFieldName(entityType, field.name) || field.name;
+ const matchesInMap = fieldNameToMatches.get(groupedFieldName);
+ if (matchesInMap) {
+ matchesInMap.push(field);
+ } else {
+ fieldNameToMatches.set(groupedFieldName, [field]);
+ fieldNames.push(groupedFieldName);
+ }
+ });
+ return fieldNames.map((fieldName) => ({
+ fieldName,
+ matchedFields: fieldNameToMatches.get(fieldName) ?? [],
+ }));
+};
+
+export const getMatchesPrioritized = (
+ entityType: EntityType,
+ matchedFields: MatchedField[],
+ prioritizedField: string,
+): Array<MatchesGroupedByFieldName> => {
+ const { location } = window;
+ const params = QueryString.parse(location.search, { arrayFormat: 'comma' });
+ const query: string = decodeURIComponent(params.query ? (params.query as string) : '');
+ const matches = fromQueryGetBestMatch(matchedFields, query, prioritizedField);
+ return getMatchesGroupedByFieldName(entityType, matches);
+};
+
+export const isHighlightableEntityField = (field: MatchedField) =>
+ !!field.entity && HIGHLIGHTABLE_ENTITY_TYPES.includes(field.entity.type);
+
+export const isDescriptionField = (field: MatchedField) => field.name.toLowerCase().includes('description');
+
+const SURROUNDING_DESCRIPTION_CHARS = 10;
+const MAX_DESCRIPTION_CHARS = 50;
+
+export const getDescriptionSlice = (text: string, target: string) => {
+ const queryIndex = text.indexOf(target);
+ const start = Math.max(0, queryIndex - SURROUNDING_DESCRIPTION_CHARS);
+ const end = Math.min(
+ start + MAX_DESCRIPTION_CHARS,
+ text.length,
+ queryIndex + target.length + SURROUNDING_DESCRIPTION_CHARS,
+ );
+ const startEllipsis = start > 0 ? '...' : '';
+ const endEllipsis = end < text.length ? '...' : '';
+ return `${startEllipsis}${text.slice(start, end)}${endEllipsis}`;
+};
diff --git a/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx b/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx
new file mode 100644
index 00000000000000..9dbd67883bf642
--- /dev/null
+++ b/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx
@@ -0,0 +1,39 @@
+import styled from 'styled-components';
+import React from 'react';
+import { useHistory } from 'react-router';
+import { SearchSuggestion } from '../../../types.generated';
+import { navigateToSearchUrl } from '../utils/navigateToSearchUrl';
+import { ANTD_GRAY_V2 } from '../../entity/shared/constants';
+
+const TextWrapper = styled.div`
+ font-size: 14px;
+ color: ${ANTD_GRAY_V2[8]};
+ margin: 16px 0 -8px 32px;
+`;
+
+export const SuggestedText = styled.span`
+ color: ${(props) => props.theme.styles['primary-color']};
+ text-decoration: underline ${(props) => props.theme.styles['primary-color']};
+ cursor: pointer;
+`;
+
+interface Props {
+ suggestions: SearchSuggestion[];
+}
+
+export default function SearchQuerySuggester({ suggestions }: Props) {
+ const history = useHistory();
+
+ if (suggestions.length === 0) return null;
+ const suggestText = suggestions[0].text;
+
+ function searchForSuggestion() {
+ navigateToSearchUrl({ query: suggestText, history });
+ }
+
+ return (
+        <TextWrapper>
+            Did you mean <SuggestedText onClick={searchForSuggestion}>{suggestText}</SuggestedText>
+        </TextWrapper>
+ );
+}
diff --git a/datahub-web-react/src/app/settings/SettingsPage.tsx b/datahub-web-react/src/app/settings/SettingsPage.tsx
index bfec9b395cff21..339cc0cf44bace 100644
--- a/datahub-web-react/src/app/settings/SettingsPage.tsx
+++ b/datahub-web-react/src/app/settings/SettingsPage.tsx
@@ -7,6 +7,7 @@ import {
ToolOutlined,
FilterOutlined,
TeamOutlined,
+ PushpinOutlined,
} from '@ant-design/icons';
import { Redirect, Route, useHistory, useLocation, useRouteMatch, Switch } from 'react-router';
import styled from 'styled-components';
@@ -19,6 +20,7 @@ import { Preferences } from './Preferences';
import { ManageViews } from '../entity/view/ManageViews';
import { useUserContext } from '../context/useUserContext';
import { ManageOwnership } from '../entity/ownership/ManageOwnership';
+import ManagePosts from './posts/ManagePosts';
const PageContainer = styled.div`
display: flex;
@@ -62,6 +64,7 @@ const PATHS = [
{ path: 'preferences', content: <Preferences /> },
{ path: 'views', content: <ManageViews /> },
{ path: 'ownership', content: <ManageOwnership /> },
+    { path: 'posts', content: <ManagePosts /> },
];
/**
@@ -91,6 +94,7 @@ export const SettingsPage = () => {
const showUsersGroups = (isIdentityManagementEnabled && me && me?.platformPrivileges?.manageIdentities) || false;
const showViews = isViewsEnabled || false;
const showOwnershipTypes = me && me?.platformPrivileges?.manageOwnershipTypes;
+ const showHomePagePosts = me && me?.platformPrivileges?.manageGlobalAnnouncements;
return (
@@ -143,6 +147,11 @@ export const SettingsPage = () => {
Ownership Types
)}
+                        {showHomePagePosts && (
+                            <Menu.Item key="posts">
+                                <PushpinOutlined /> Home Page Posts
+                            </Menu.Item>
+                        )}
diff --git a/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx b/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx
new file mode 100644
index 00000000000000..a8d6cfa64c9c14
--- /dev/null
+++ b/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx
@@ -0,0 +1,91 @@
+import React, { useState } from 'react';
+import { Form, Input, Typography, FormInstance, Radio } from 'antd';
+import styled from 'styled-components';
+import {
+ DESCRIPTION_FIELD_NAME,
+ LINK_FIELD_NAME,
+ LOCATION_FIELD_NAME,
+ TITLE_FIELD_NAME,
+ TYPE_FIELD_NAME,
+} from './constants';
+import { PostContentType } from '../../../types.generated';
+
+const TopFormItem = styled(Form.Item)`
+ margin-bottom: 24px;
+`;
+
+const SubFormItem = styled(Form.Item)`
+ margin-bottom: 0;
+`;
+
+type Props = {
+ setCreateButtonEnabled: (isEnabled: boolean) => void;
+ form: FormInstance;
+};
+
+export default function CreatePostForm({ setCreateButtonEnabled, form }: Props) {
+ const [postType, setPostType] = useState(PostContentType.Text);
+
+ return (
+
+ );
+}
diff --git a/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx b/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx
new file mode 100644
index 00000000000000..b4851ecb029693
--- /dev/null
+++ b/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx
@@ -0,0 +1,107 @@
+import React, { useState } from 'react';
+import { Button, Form, message, Modal } from 'antd';
+import CreatePostForm from './CreatePostForm';
+import {
+ CREATE_POST_BUTTON_ID,
+ DESCRIPTION_FIELD_NAME,
+ LINK_FIELD_NAME,
+ LOCATION_FIELD_NAME,
+ TYPE_FIELD_NAME,
+ TITLE_FIELD_NAME,
+} from './constants';
+import { useEnterKeyListener } from '../../shared/useEnterKeyListener';
+import { MediaType, PostContentType, PostType } from '../../../types.generated';
+import { useCreatePostMutation } from '../../../graphql/mutations.generated';
+
+type Props = {
+ onClose: () => void;
+ onCreate: (
+ contentType: string,
+ title: string,
+ description: string | undefined,
+ link: string | undefined,
+ location: string | undefined,
+ ) => void;
+};
+
+export default function CreatePostModal({ onClose, onCreate }: Props) {
+ const [createPostMutation] = useCreatePostMutation();
+ const [createButtonEnabled, setCreateButtonEnabled] = useState(false);
+ const [form] = Form.useForm();
+ const onCreatePost = () => {
+ const contentTypeValue = form.getFieldValue(TYPE_FIELD_NAME) ?? PostContentType.Text;
+ const mediaValue =
+ form.getFieldValue(TYPE_FIELD_NAME) && form.getFieldValue(LOCATION_FIELD_NAME)
+ ? {
+ type: MediaType.Image,
+ location: form.getFieldValue(LOCATION_FIELD_NAME) ?? null,
+ }
+ : null;
+ createPostMutation({
+ variables: {
+ input: {
+ postType: PostType.HomePageAnnouncement,
+ content: {
+ contentType: contentTypeValue,
+ title: form.getFieldValue(TITLE_FIELD_NAME),
+ description: form.getFieldValue(DESCRIPTION_FIELD_NAME) ?? null,
+ link: form.getFieldValue(LINK_FIELD_NAME) ?? null,
+ media: mediaValue,
+ },
+ },
+ },
+ })
+ .then(({ errors }) => {
+ if (!errors) {
+ message.success({
+ content: `Created Post!`,
+ duration: 3,
+ });
+ onCreate(
+ form.getFieldValue(TYPE_FIELD_NAME) ?? PostContentType.Text,
+ form.getFieldValue(TITLE_FIELD_NAME),
+ form.getFieldValue(DESCRIPTION_FIELD_NAME),
+ form.getFieldValue(LINK_FIELD_NAME),
+ form.getFieldValue(LOCATION_FIELD_NAME),
+ );
+ form.resetFields();
+ }
+ })
+ .catch((e) => {
+ message.destroy();
+                message.error({ content: 'Failed to create Post! An unknown error occurred.', duration: 3 });
+ console.error('Failed to create Post:', e.message);
+ });
+ onClose();
+ };
+
+ // Handle the Enter press
+ useEnterKeyListener({
+ querySelectorToExecuteClick: '#createPostButton',
+ });
+
+ return (
+
+
+ Cancel
+
+
+ Create
+
+ >
+ }
+ >
+
+
+ );
+}
diff --git a/datahub-web-react/src/app/settings/posts/ManagePosts.tsx b/datahub-web-react/src/app/settings/posts/ManagePosts.tsx
new file mode 100644
index 00000000000000..e0f694c192c629
--- /dev/null
+++ b/datahub-web-react/src/app/settings/posts/ManagePosts.tsx
@@ -0,0 +1,40 @@
+import { Typography } from 'antd';
+import React from 'react';
+import styled from 'styled-components/macro';
+import { PostList } from './PostsList';
+
+const PageContainer = styled.div`
+ padding-top: 20px;
+ width: 100%;
+ height: 100%;
+`;
+
+const PageHeaderContainer = styled.div`
+ && {
+ padding-left: 24px;
+ }
+`;
+
+const PageTitle = styled(Typography.Title)`
+ && {
+ margin-bottom: 12px;
+ }
+`;
+
+const ListContainer = styled.div``;
+
+export default function ManagePosts() {
+ return (
+        <PageContainer>
+            <PageHeaderContainer>
+                <PageTitle level={3}>Home Page Posts</PageTitle>
+                <Typography.Paragraph type="secondary">
+                    View and manage pinned posts that appear to all users on the landing page.
+                </Typography.Paragraph>
+            </PageHeaderContainer>
+            <ListContainer>
+                <PostList />
+            </ListContainer>
+        </PageContainer>
+ );
+}
diff --git a/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx b/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx
new file mode 100644
index 00000000000000..e3fc424a47ef28
--- /dev/null
+++ b/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx
@@ -0,0 +1,62 @@
+import React from 'react';
+import { DeleteOutlined } from '@ant-design/icons';
+import { Dropdown, Menu, message, Modal } from 'antd';
+import { MenuIcon } from '../../entity/shared/EntityDropdown/EntityDropdown';
+import { useDeletePostMutation } from '../../../graphql/post.generated';
+
+type Props = {
+ urn: string;
+ title: string;
+ onDelete?: () => void;
+};
+
+export default function PostItemMenu({ title, urn, onDelete }: Props) {
+ const [deletePostMutation] = useDeletePostMutation();
+
+ const deletePost = () => {
+ deletePostMutation({
+ variables: {
+ urn,
+ },
+ })
+ .then(({ errors }) => {
+ if (!errors) {
+ message.success('Deleted Post!');
+ onDelete?.();
+ }
+ })
+ .catch(() => {
+ message.destroy();
+                message.error({ content: `Failed to delete Post! An unknown error occurred.`, duration: 3 });
+ });
+ };
+
+ const onConfirmDelete = () => {
+ Modal.confirm({
+ title: `Delete Post '${title}'`,
+ content: `Are you sure you want to remove this Post?`,
+ onOk() {
+ deletePost();
+ },
+ onCancel() {},
+ okText: 'Yes',
+ maskClosable: true,
+ closable: true,
+ });
+ };
+
+ return (
+
+
+ Delete
+
+
+ }
+ >
+
+
+ );
+}
diff --git a/datahub-web-react/src/app/settings/posts/PostsList.tsx b/datahub-web-react/src/app/settings/posts/PostsList.tsx
new file mode 100644
index 00000000000000..5ae2be1547f9b7
--- /dev/null
+++ b/datahub-web-react/src/app/settings/posts/PostsList.tsx
@@ -0,0 +1,200 @@
+import React, { useEffect, useState } from 'react';
+import { Button, Empty, Pagination, Typography } from 'antd';
+import { useLocation } from 'react-router';
+import styled from 'styled-components';
+import * as QueryString from 'query-string';
+import { PlusOutlined } from '@ant-design/icons';
+import { AlignType } from 'rc-table/lib/interface';
+import CreatePostModal from './CreatePostModal';
+import { PostColumn, PostEntry, PostListMenuColumn } from './PostsListColumns';
+import { useEntityRegistry } from '../../useEntityRegistry';
+import { useListPostsQuery } from '../../../graphql/post.generated';
+import { scrollToTop } from '../../shared/searchUtils';
+import { addToListPostCache, removeFromListPostCache } from './utils';
+import { Message } from '../../shared/Message';
+import TabToolbar from '../../entity/shared/components/styled/TabToolbar';
+import { SearchBar } from '../../search/SearchBar';
+import { StyledTable } from '../../entity/shared/components/styled/StyledTable';
+import { POST_TYPE_TO_DISPLAY_TEXT } from './constants';
+
+const PostsContainer = styled.div``;
+
+export const PostsPaginationContainer = styled.div`
+ display: flex;
+ justify-content: center;
+ padding: 12px;
+ padding-left: 16px;
+ border-bottom: 1px solid;
+ border-color: ${(props) => props.theme.styles['border-color-base']};
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+`;
+
+const PaginationInfo = styled(Typography.Text)`
+ padding: 0px;
+`;
+
+const DEFAULT_PAGE_SIZE = 10;
+
+export const PostList = () => {
+ const entityRegistry = useEntityRegistry();
+ const location = useLocation();
+ const params = QueryString.parse(location.search, { arrayFormat: 'comma' });
+ const paramsQuery = (params?.query as string) || undefined;
+ const [query, setQuery] = useState(undefined);
+ useEffect(() => setQuery(paramsQuery), [paramsQuery]);
+
+ const [page, setPage] = useState(1);
+ const [isCreatingPost, setIsCreatingPost] = useState(false);
+
+ const pageSize = DEFAULT_PAGE_SIZE;
+ const start = (page - 1) * pageSize;
+
+ const { loading, error, data, client, refetch } = useListPostsQuery({
+ variables: {
+ input: {
+ start,
+ count: pageSize,
+ query,
+ },
+ },
+ fetchPolicy: query && query.length > 0 ? 'no-cache' : 'cache-first',
+ });
+
+ const totalPosts = data?.listPosts?.total || 0;
+ const lastResultIndex = start + pageSize > totalPosts ? totalPosts : start + pageSize;
+ const posts = data?.listPosts?.posts || [];
+
+ const onChangePage = (newPage: number) => {
+ scrollToTop();
+ setPage(newPage);
+ };
+
+ const handleDelete = (urn: string) => {
+ removeFromListPostCache(client, urn, page, pageSize);
+ setTimeout(() => {
+ refetch?.();
+ }, 2000);
+ };
+
+ const allColumns = [
+ {
+ title: 'Title',
+ dataIndex: '',
+ key: 'title',
+ sorter: (sourceA, sourceB) => {
+ return sourceA.title.localeCompare(sourceB.title);
+ },
+ render: (record: PostEntry) => PostColumn(record.title, 200),
+ width: '20%',
+ },
+ {
+ title: 'Description',
+ dataIndex: '',
+ key: 'description',
+ render: (record: PostEntry) => PostColumn(record.description || ''),
+ },
+ {
+ title: 'Type',
+ dataIndex: '',
+ key: 'type',
+ render: (record: PostEntry) => PostColumn(POST_TYPE_TO_DISPLAY_TEXT[record.contentType]),
+ style: { minWidth: 100 },
+ width: '10%',
+ },
+ {
+ title: '',
+ dataIndex: '',
+ width: '5%',
+ align: 'right' as AlignType,
+ key: 'menu',
+ render: PostListMenuColumn(handleDelete),
+ },
+ ];
+
+ const tableData = posts.map((post) => {
+ return {
+ urn: post.urn,
+ title: post.content.title,
+ description: post.content.description,
+ contentType: post.content.contentType,
+ };
+ });
+
+ return (
+ <>
+ {!data && loading && }
+ {error && }
+
+
+ setIsCreatingPost(true)}>
+ New Post
+
+ null}
+ onQueryChange={(q) => setQuery(q && q.length > 0 ? q : undefined)}
+ entityRegistry={entityRegistry}
+ hideRecommendations
+ />
+
+ }}
+ />
+ {totalPosts > pageSize && (
+
+
+
+ {lastResultIndex > 0 ? (page - 1) * pageSize + 1 : 0} - {lastResultIndex}
+ {' '}
+ of {totalPosts}
+
+
+
+
+ )}
+ {isCreatingPost && (
+ setIsCreatingPost(false)}
+ onCreate={(urn, title, description) => {
+ addToListPostCache(
+ client,
+ {
+ urn,
+ properties: {
+ title,
+ description: description || null,
+ },
+ },
+ pageSize,
+ );
+ setTimeout(() => refetch(), 2000);
+ }}
+ />
+ )}
+
+ >
+ );
+};
diff --git a/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx b/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx
new file mode 100644
index 00000000000000..38f910baf8f412
--- /dev/null
+++ b/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx
@@ -0,0 +1,26 @@
+import React from 'react';
+// import { Typography } from 'antd';
+import styled from 'styled-components/macro';
+import { Maybe } from 'graphql/jsutils/Maybe';
+import PostItemMenu from './PostItemMenu';
+
+export interface PostEntry {
+ title: string;
+ contentType: string;
+    description: Maybe<string>;
+ urn: string;
+}
+
+const PostText = styled.div<{ minWidth?: number }>`
+ ${(props) => props.minWidth !== undefined && `min-width: ${props.minWidth}px;`}
+`;
+
+export function PostListMenuColumn(handleDelete: (urn: string) => void) {
+ return (record: PostEntry) => (
+        <PostItemMenu urn={record.urn} title={record.title} onDelete={() => handleDelete(record.urn)} />
+ );
+}
+
+export function PostColumn(text: string, minWidth?: number) {
+    return <PostText minWidth={minWidth}>{text}</PostText>;
+}
diff --git a/datahub-web-react/src/app/settings/posts/constants.ts b/datahub-web-react/src/app/settings/posts/constants.ts
new file mode 100644
index 00000000000000..5a164019fe2e55
--- /dev/null
+++ b/datahub-web-react/src/app/settings/posts/constants.ts
@@ -0,0 +1,13 @@
+import { PostContentType } from '../../../types.generated';
+
+export const TITLE_FIELD_NAME = 'title';
+export const DESCRIPTION_FIELD_NAME = 'description';
+export const LINK_FIELD_NAME = 'link';
+export const LOCATION_FIELD_NAME = 'location';
+export const TYPE_FIELD_NAME = 'type';
+export const CREATE_POST_BUTTON_ID = 'createPostButton';
+
+export const POST_TYPE_TO_DISPLAY_TEXT = {
+ [PostContentType.Link]: 'Link',
+ [PostContentType.Text]: 'Announcement',
+};
diff --git a/datahub-web-react/src/app/settings/posts/utils.ts b/datahub-web-react/src/app/settings/posts/utils.ts
new file mode 100644
index 00000000000000..ce48c7400738ce
--- /dev/null
+++ b/datahub-web-react/src/app/settings/posts/utils.ts
@@ -0,0 +1,77 @@
+import { ListPostsDocument, ListPostsQuery } from '../../../graphql/post.generated';
+
+/**
+ * Add an entry to the list posts cache.
+ */
+export const addToListPostCache = (client, newPost, pageSize) => {
+ // Read the data from our cache for this query.
+ const currData: ListPostsQuery | null = client.readQuery({
+ query: ListPostsDocument,
+ variables: {
+ input: {
+ start: 0,
+ count: pageSize,
+ },
+ },
+ });
+
+ // Add our new post into the existing list.
+ const newPosts = [newPost, ...(currData?.listPosts?.posts || [])];
+
+ // Write our data back to the cache.
+ client.writeQuery({
+ query: ListPostsDocument,
+ variables: {
+ input: {
+ start: 0,
+ count: pageSize,
+ },
+ },
+ data: {
+ listPosts: {
+ start: 0,
+ count: (currData?.listPosts?.count || 0) + 1,
+ total: (currData?.listPosts?.total || 0) + 1,
+ posts: newPosts,
+ },
+ },
+ });
+};
+
+/**
+ * Remove an entry from the list posts cache.
+ */
+export const removeFromListPostCache = (client, urn, page, pageSize) => {
+ // Read the data from our cache for this query.
+ const currData: ListPostsQuery | null = client.readQuery({
+ query: ListPostsDocument,
+ variables: {
+ input: {
+ start: (page - 1) * pageSize,
+ count: pageSize,
+ },
+ },
+ });
+
+ // Remove the post from the existing posts set.
+ const newPosts = [...(currData?.listPosts?.posts || []).filter((post) => post.urn !== urn)];
+
+ // Write our data back to the cache.
+ client.writeQuery({
+ query: ListPostsDocument,
+ variables: {
+ input: {
+ start: (page - 1) * pageSize,
+ count: pageSize,
+ },
+ },
+ data: {
+ listPosts: {
+ start: currData?.listPosts?.start || 0,
+ count: (currData?.listPosts?.count || 1) - 1,
+ total: (currData?.listPosts?.total || 1) - 1,
+ posts: newPosts,
+ },
+ },
+ });
+};
diff --git a/datahub-web-react/src/app/shared/tags/tag/Tag.tsx b/datahub-web-react/src/app/shared/tags/tag/Tag.tsx
index 2288238091776b..ed2460b6eea3ce 100644
--- a/datahub-web-react/src/app/shared/tags/tag/Tag.tsx
+++ b/datahub-web-react/src/app/shared/tags/tag/Tag.tsx
@@ -8,6 +8,7 @@ import { StyledTag } from '../../../entity/shared/components/styled/StyledTag';
import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip';
import { useEntityRegistry } from '../../../useEntityRegistry';
import { TagProfileDrawer } from '../TagProfileDrawer';
+import { useHasMatchedFieldByUrn } from '../../../search/context/SearchResultContext';
const TagLink = styled.span`
display: inline-block;
@@ -41,6 +42,7 @@ export default function Tag({
}: Props) {
const entityRegistry = useEntityRegistry();
const [removeTagMutation] = useRemoveTagMutation();
+ const highlightTag = useHasMatchedFieldByUrn(tag.tag.urn, 'tags');
const [tagProfileDrawerVisible, setTagProfileDrawerVisible] = useState(false);
const [addTagUrn, setAddTagUrn] = useState('');
@@ -110,6 +112,7 @@ export default function Tag({
removeTag(tag);
}}
fontSize={fontSize}
+ highlightTag={highlightTag}
>
`
+const StyledTag = styled(Tag)<{ fontSize?: number; highlightTerm?: boolean }>`
+ &&& {
+ ${(props) =>
+ props.highlightTerm &&
+ `
+ background: ${props.theme.styles['highlight-color']};
+ border: 1px solid ${props.theme.styles['highlight-border-color']};
+ `}
+ }
${(props) => props.fontSize && `font-size: ${props.fontSize}px;`}
`;
@@ -38,6 +47,7 @@ export default function TermContent({
}: Props) {
const entityRegistry = useEntityRegistry();
const [removeTermMutation] = useRemoveTermMutation();
+ const highlightTerm = useHasMatchedFieldByUrn(term.term.urn, 'glossaryTerms');
const removeTerm = (termToRemove: GlossaryTermAssociation) => {
onOpenModal?.();
@@ -85,6 +95,7 @@ export default function TermContent({
removeTerm(term);
}}
fontSize={fontSize}
+ highlightTerm={highlightTerm}
>
diff --git a/datahub-web-react/src/appConfigContext.tsx b/datahub-web-react/src/appConfigContext.tsx
index 3b34b108ecc93d..807a17c4fd6a43 100644
--- a/datahub-web-react/src/appConfigContext.tsx
+++ b/datahub-web-react/src/appConfigContext.tsx
@@ -27,6 +27,9 @@ export const DEFAULT_APP_CONFIG = {
entityProfile: {
domainDefaultTab: null,
},
+ searchResult: {
+ enableNameHighlight: false,
+ },
},
authConfig: {
tokenAuthEnabled: false,
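
A hedged sketch of reading the new flag on the client — it assumes the existing `useAppConfig` hook and that `searchResult` sits under `visualConfig`, which is inferred from the surrounding default-config shape rather than shown in this hunk:

```ts
import { useAppConfig } from './app/useAppConfig'; // assumed path to the existing app-config hook

export function useNameHighlightEnabled(): boolean {
    const { config } = useAppConfig();
    // Mirror DEFAULT_APP_CONFIG: highlighting stays off unless the server enables it.
    return config?.visualConfig?.searchResult?.enableNameHighlight ?? false;
}
```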
diff --git a/datahub-web-react/src/conf/Global.ts b/datahub-web-react/src/conf/Global.ts
index b16dd1eaace57b..e1220b8c81b53c 100644
--- a/datahub-web-react/src/conf/Global.ts
+++ b/datahub-web-react/src/conf/Global.ts
@@ -28,6 +28,7 @@ export enum PageRoutes {
SETTINGS_VIEWS = '/settings/views',
EMBED = '/embed',
EMBED_LOOKUP = '/embed/lookup/:url',
+ SETTINGS_POSTS = '/settings/posts',
}
/**
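
The new route constant would be consumed roughly like this; `ManagePosts` is a hypothetical page component and the import paths are illustrative:

```tsx
import React from 'react';
import { Route } from 'react-router-dom';
import { PageRoutes } from '../conf/Global';
import ManagePosts from './settings/posts/ManagePosts'; // hypothetical page component

export function SettingsPostsRoute() {
    // Rendered inside the settings router so /settings/posts resolves to the posts manager.
    return <Route path={PageRoutes.SETTINGS_POSTS} render={() => <ManagePosts />} />;
}
```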
diff --git a/datahub-web-react/src/conf/theme/theme_dark.config.json b/datahub-web-react/src/conf/theme/theme_dark.config.json
index b648f3d997f217..9746c3ddde5f37 100644
--- a/datahub-web-react/src/conf/theme/theme_dark.config.json
+++ b/datahub-web-react/src/conf/theme/theme_dark.config.json
@@ -17,7 +17,9 @@
"disabled-color": "fade(white, 25%)",
"steps-nav-arrow-color": "fade(white, 25%)",
"homepage-background-upper-fade": "#FFFFFF",
- "homepage-background-lower-fade": "#333E4C"
+ "homepage-background-lower-fade": "#333E4C",
+ "highlight-color": "#E6F4FF",
+ "highlight-border-color": "#BAE0FF"
},
"assets": {
"logoUrl": "/assets/logo.png"
diff --git a/datahub-web-react/src/conf/theme/theme_light.config.json b/datahub-web-react/src/conf/theme/theme_light.config.json
index e842fdb1bb8aad..906c04e38a1baf 100644
--- a/datahub-web-react/src/conf/theme/theme_light.config.json
+++ b/datahub-web-react/src/conf/theme/theme_light.config.json
@@ -20,7 +20,9 @@
"homepage-background-lower-fade": "#FFFFFF",
"homepage-text-color": "#434343",
"box-shadow": "0px 0px 30px 0px rgb(239 239 239)",
- "box-shadow-hover": "0px 1px 0px 0.5px rgb(239 239 239)"
+ "box-shadow-hover": "0px 1px 0px 0.5px rgb(239 239 239)",
+ "highlight-color": "#E6F4FF",
+ "highlight-border-color": "#BAE0FF"
},
"assets": {
"logoUrl": "/assets/logo.png"
diff --git a/datahub-web-react/src/conf/theme/types.ts b/datahub-web-react/src/conf/theme/types.ts
index 98140cbbd553d9..7d78230092700a 100644
--- a/datahub-web-react/src/conf/theme/types.ts
+++ b/datahub-web-react/src/conf/theme/types.ts
@@ -18,6 +18,8 @@ export type Theme = {
'homepage-background-lower-fade': string;
'box-shadow': string;
'box-shadow-hover': string;
+ 'highlight-color': string;
+ 'highlight-border-color': string;
};
assets: {
logoUrl: string;
diff --git a/datahub-web-react/src/graphql/app.graphql b/datahub-web-react/src/graphql/app.graphql
index 4b1295f1024a29..bf15e5f757f8f4 100644
--- a/datahub-web-react/src/graphql/app.graphql
+++ b/datahub-web-react/src/graphql/app.graphql
@@ -45,6 +45,9 @@ query appConfig {
defaultTab
}
}
+ searchResult {
+ enableNameHighlight
+ }
}
telemetryConfig {
enableThirdPartyLogging
diff --git a/datahub-web-react/src/graphql/me.graphql b/datahub-web-react/src/graphql/me.graphql
index 2c693c747af56c..af850c9c3ce286 100644
--- a/datahub-web-react/src/graphql/me.graphql
+++ b/datahub-web-react/src/graphql/me.graphql
@@ -46,6 +46,7 @@ query getMe {
createTags
manageGlobalViews
manageOwnershipTypes
+ manageGlobalAnnouncements
}
}
}
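
The `manageGlobalAnnouncements` platform privilege is presumably what gates the new posts UI. A sketch of checking it from the generated `getMe` hook — the hook name follows the usual codegen convention, and the `platformPrivileges` parent field is assumed from context:

```tsx
import React from 'react';
import { Button } from 'antd';
import { useGetMeQuery } from '../../graphql/me.generated'; // assumed codegen output for getMe

export function NewPostButton({ onClick }: { onClick: () => void }) {
    const { data } = useGetMeQuery({ fetchPolicy: 'cache-first' });
    // Hide the entry point entirely for users without the privilege.
    if (!data?.me?.platformPrivileges?.manageGlobalAnnouncements) return null;
    return (
        <Button type="primary" onClick={onClick}>
            + New Post
        </Button>
    );
}
```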
diff --git a/datahub-web-react/src/graphql/post.graphql b/datahub-web-react/src/graphql/post.graphql
index c19f38fc7751c1..ee092ad4fba90d 100644
--- a/datahub-web-react/src/graphql/post.graphql
+++ b/datahub-web-react/src/graphql/post.graphql
@@ -20,3 +20,11 @@ query listPosts($input: ListPostsInput!) {
}
}
}
+
+mutation createPost($input: CreatePostInput!) {
+ createPost(input: $input)
+}
+
+mutation deletePost($urn: String!) {
+ deletePost(urn: $urn)
+}
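
The delete path would mirror the create path: run the new `deletePost` mutation, then evict the row with `removeFromListPostCache`. A sketch assuming the generated `useDeletePostMutation` hook and illustrative import paths:

```ts
import { useApolloClient } from '@apollo/client';
import { useDeletePostMutation } from './post.generated'; // assumed codegen output
import { removeFromListPostCache } from '../app/settings/posts/utils';

const PAGE_SIZE = 10;

/** Delete a post and evict it from the cached page so the table updates immediately. */
export function useDeletePost(page: number) {
    const client = useApolloClient();
    const [deletePostMutation] = useDeletePostMutation();

    return (urn: string) =>
        deletePostMutation({ variables: { urn } }).then(() => {
            removeFromListPostCache(client, urn, page, PAGE_SIZE);
        });
}
```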
diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql
index 172a6d957e2874..7cd868d7cd2b2b 100644
--- a/datahub-web-react/src/graphql/search.graphql
+++ b/datahub-web-react/src/graphql/search.graphql
@@ -832,6 +832,11 @@ fragment searchResults on SearchResults {
matchedFields {
name
value
+ entity {
+ urn
+ type
+ ...entityDisplayNameFields
+ }
}
insights {
text
@@ -841,6 +846,11 @@ fragment searchResults on SearchResults {
facets {
...facetFields
}
+ suggestions {
+ text
+ frequency
+ score
+ }
}
fragment schemaFieldEntityFields on SchemaFieldEntity {
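
The `suggestions` block added above is the kind of data that usually backs a "did you mean" prompt. A minimal, self-contained consumer — the `SearchSuggestion` type mirrors the three selected fields; the component itself is illustrative:

```tsx
import React from 'react';

type SearchSuggestion = { text: string; frequency: number; score: number };

// Surface the top-ranked suggestion as a "did you mean" prompt under an empty result set.
export function DidYouMean({ suggestions, onSearch }: { suggestions: SearchSuggestion[]; onSearch: (query: string) => void }) {
    if (!suggestions.length) return null;
    const best = suggestions[0];
    return (
        <span>
            Did you mean{' '}
            <button type="button" onClick={() => onSearch(best.text)}>
                {best.text}
            </button>
            ?
        </span>
    );
}
```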
diff --git a/docker/airflow/local_airflow.md b/docker/airflow/local_airflow.md
index d0a2b18cff2d27..55a64f5c122c51 100644
--- a/docker/airflow/local_airflow.md
+++ b/docker/airflow/local_airflow.md
@@ -138,25 +138,57 @@ Successfully added `conn_id`=datahub_rest_default : datahub_rest://:@http://data
Navigate the Airflow UI to find the sample Airflow dag we just brought in
-![Find the DAG](../../docs/imgs/airflow/find_the_dag.png)
By default, Airflow loads all DAG-s in paused status. Unpause the sample DAG to use it.
-![Paused DAG](../../docs/imgs/airflow/paused_dag.png)
-![Unpaused DAG](../../docs/imgs/airflow/unpaused_dag.png)
Then trigger the DAG to run.
-![Trigger the DAG](../../docs/imgs/airflow/trigger_dag.png)
After the DAG runs successfully, go over to your DataHub instance to see the Pipeline and navigate its lineage.
-![DataHub Pipeline View](../../docs/imgs/airflow/datahub_pipeline_view.png)
-![DataHub Pipeline Entity](../../docs/imgs/airflow/datahub_pipeline_entity.png)
-![DataHub Task View](../../docs/imgs/airflow/datahub_task_view.png)
-![DataHub Lineage View](../../docs/imgs/airflow/datahub_lineage_view.png)
## TroubleShooting
@@ -164,9 +196,17 @@ Most issues are related to connectivity between Airflow and DataHub.
Here is how you can debug them.
-![Find the Task Log](../../docs/imgs/airflow/finding_failed_log.png)
-![Inspect the Log](../../docs/imgs/airflow/connection_error.png)
In this case, clearly the connection `datahub-rest` has not been registered. Looks like we forgot to register the connection with Airflow!
Let's execute Step 4 to register the datahub connection with Airflow.
@@ -175,4 +215,8 @@ In case the connection was registered successfully but you are still seeing `Fai
After re-running the DAG, we see success!
-![Pipeline Success](../../docs/imgs/airflow/successful_run.png)
diff --git a/docker/build.gradle b/docker/build.gradle
index 829bc344411f3e..ae101fe1defc5f 100644
--- a/docker/build.gradle
+++ b/docker/build.gradle
@@ -87,6 +87,7 @@ task quickstartDebug(type: Exec, dependsOn: ':metadata-ingestion:install') {
dependsOn(debug_modules.collect { it + ':dockerTagDebug' })
shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke'
+ environment "DATAHUB_PRECREATE_TOPICS", "true"
environment "DATAHUB_TELEMETRY_ENABLED", "false"
environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}"
diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile
index bb4b0bc42e167c..3d47f796173704 100644
--- a/docker/datahub-ingestion-base/Dockerfile
+++ b/docker/datahub-ingestion-base/Dockerfile
@@ -84,4 +84,4 @@ FROM ${BASE_IMAGE} as slim-install
FROM ${APP_ENV}-install
USER datahub
-ENV PATH="/datahub-ingestion/.local/bin:$PATH"
+ENV PATH="/datahub-ingestion/.local/bin:$PATH"
\ No newline at end of file
diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile
index d16caea2fcecd1..0ecc30d02ac3f7 100644
--- a/docker/datahub-ingestion/Dockerfile
+++ b/docker/datahub-ingestion/Dockerfile
@@ -30,4 +30,4 @@ FROM base as dev-install
FROM ${APP_ENV}-install as final
USER datahub
-ENV PATH="/datahub-ingestion/.local/bin:$PATH"
\ No newline at end of file
+ENV PATH="/datahub-ingestion/.local/bin:$PATH"
diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile
index 8cf9d0869dc9b8..a9c75521fead13 100644
--- a/docker/kafka-setup/Dockerfile
+++ b/docker/kafka-setup/Dockerfile
@@ -1,5 +1,7 @@
+ARG KAFKA_DOCKER_VERSION=7.4.1
+
# Using as a base image because to get the needed jars for confluent utils
-FROM confluentinc/cp-base-new@sha256:ac4e0f9bcaecdab728740529f37452231fa40760fcf561759fc3b219f46d2cc9 as confluent_base
+FROM confluentinc/cp-base-new:$KAFKA_DOCKER_VERSION as confluent_base
ARG MAVEN_REPO="https://repo1.maven.org/maven2"
ARG SNAKEYAML_VERSION="2.0"
@@ -13,15 +15,6 @@ FROM python:3-alpine
ENV KAFKA_VERSION 3.4.1
ENV SCALA_VERSION 2.13
-# Set the classpath for JARs required by `cub`
-ENV CUB_CLASSPATH='"/usr/share/java/cp-base-new/*"'
-
-# Confluent Docker Utils Version (Namely the tag or branch to grab from git to install)
-ARG PYTHON_CONFLUENT_DOCKER_UTILS_VERSION="v0.0.60"
-
-# This can be overriden for an offline/air-gapped builds
-ARG PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC="git+https://github.com/confluentinc/confluent-docker-utils@${PYTHON_CONFLUENT_DOCKER_UTILS_VERSION}"
-
LABEL name="kafka" version=${KAFKA_VERSION}
RUN apk add --no-cache bash coreutils
@@ -35,11 +28,6 @@ RUN mkdir -p /opt \
&& mv /opt/kafka_${SCALA_VERSION}-${KAFKA_VERSION} /opt/kafka \
&& adduser -DH -s /sbin/nologin kafka \
&& chown -R kafka: /opt/kafka \
- && echo "===> Installing python packages ..." \
- && pip install --no-cache-dir --upgrade pip wheel setuptools \
- && pip install jinja2 requests \
- && pip install "Cython<3.0" "PyYAML<6" --no-build-isolation \
- && pip install --prefer-binary --prefix=/usr/local --upgrade "${PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC}" \
&& rm -rf /tmp/* \
&& apk del --purge .build-deps
@@ -69,7 +57,8 @@ ENV USE_CONFLUENT_SCHEMA_REGISTRY="TRUE"
COPY docker/kafka-setup/kafka-setup.sh ./kafka-setup.sh
COPY docker/kafka-setup/kafka-config.sh ./kafka-config.sh
COPY docker/kafka-setup/kafka-topic-workers.sh ./kafka-topic-workers.sh
+COPY docker/kafka-setup/kafka-ready.sh ./kafka-ready.sh
-RUN chmod +x ./kafka-setup.sh && chmod +x ./kafka-topic-workers.sh
+RUN chmod +x ./kafka-setup.sh ./kafka-topic-workers.sh ./kafka-ready.sh
CMD ./kafka-setup.sh
diff --git a/docker/kafka-setup/kafka-ready.sh b/docker/kafka-setup/kafka-ready.sh
new file mode 100755
index 00000000000000..ba87bde047ef57
--- /dev/null
+++ b/docker/kafka-setup/kafka-ready.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+for i in {1..60}
+do
+ kafka-broker-api-versions.sh --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER
+ if [ $? -eq 0 ]; then
+ break
+ fi
+ if [ $i -eq 60 ]; then
+ echo "Kafka bootstrap server $KAFKA_BOOTSTRAP_SERVER not ready."
+ exit 1
+ fi
+ sleep 5s
+done
diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh
old mode 100644
new mode 100755
index 7b015421b7963b..629e9bc9484ee1
--- a/docker/kafka-setup/kafka-setup.sh
+++ b/docker/kafka-setup/kafka-setup.sh
@@ -49,8 +49,8 @@ if [[ -n "$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" ]]; then
echo "sasl.client.callback.handler.class=$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" >> $CONNECTION_PROPERTIES_PATH
fi
-cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180
-
+# cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180
+. kafka-ready.sh
############################################################
# Start Topic Creation Logic
diff --git a/docs-website/build.gradle b/docs-website/build.gradle
index 12f37033efc2f7..851c10d9ea97f1 100644
--- a/docs-website/build.gradle
+++ b/docs-website/build.gradle
@@ -77,7 +77,12 @@ task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall,
args = ['run', 'generate']
}
-task yarnStart(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) {
+task downloadHistoricalVersions(type: Exec) {
+ workingDir '.'
+ commandLine 'python3', 'download_historical_versions.py'
+}
+
+task yarnStart(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate, downloadHistoricalVersions]) {
args = ['run', 'start']
}
task fastReload(type: YarnTask) {
@@ -105,7 +110,7 @@ task serve(type: YarnTask, dependsOn: [yarnInstall] ) {
}
-task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate]) {
+task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate, downloadHistoricalVersions]) {
inputs.files(projectMdFiles)
inputs.file("package.json").withPathSensitivity(PathSensitivity.RELATIVE)
inputs.dir("src").withPathSensitivity(PathSensitivity.RELATIVE)
diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js
index c10c178424b53f..df69e8513fbfc4 100644
--- a/docs-website/docusaurus.config.js
+++ b/docs-website/docusaurus.config.js
@@ -69,6 +69,11 @@ module.exports = {
label: "Roadmap",
position: "right",
},
+ {
+ type: 'docsVersionDropdown',
+ position: 'right',
+ dropdownActiveClassDisabled: true,
+ },
{
href: "https://slack.datahubproject.io",
"aria-label": "Slack",
diff --git a/docs-website/download_historical_versions.py b/docs-website/download_historical_versions.py
new file mode 100644
index 00000000000000..a005445cb14970
--- /dev/null
+++ b/docs-website/download_historical_versions.py
@@ -0,0 +1,60 @@
+import os
+import tarfile
+import urllib.request
+import json
+
+repo_url = "https://api.github.com/repos/datahub-project/static-assets"
+
+
+def download_file(url, destination):
+ with urllib.request.urlopen(url) as response:
+ with open(destination, "wb") as f:
+ while True:
+ chunk = response.read(8192)
+ if not chunk:
+ break
+ f.write(chunk)
+
+
+def fetch_tar_urls(repo_url, folder_path):
+ api_url = f"{repo_url}/contents/{folder_path}"
+ response = urllib.request.urlopen(api_url)
+ data = response.read().decode('utf-8')
+ tar_urls = [
+ file["download_url"] for file in json.loads(data) if file["name"].endswith(".tar.gz")
+ ]
+ print(tar_urls)
+ return tar_urls
+
+
+def main():
+ folder_path = "versioned_docs"
+ destination_dir = "versioned_docs"
+ if not os.path.exists(destination_dir):
+ os.makedirs(destination_dir)
+
+ tar_urls = fetch_tar_urls(repo_url, folder_path)
+
+ for url in tar_urls:
+ filename = os.path.basename(url)
+ destination_path = os.path.join(destination_dir, filename)
+
+ version = '.'.join(filename.split('.')[:3])
+ extracted_path = os.path.join(destination_dir, version)
+ print("extracted_path", extracted_path)
+ if os.path.exists(extracted_path):
+ print(f"{extracted_path} already exists, skipping downloads")
+ continue
+ try:
+ download_file(url, destination_path)
+ print(f"Downloaded {filename} to {destination_dir}")
+ with tarfile.open(destination_path, "r:gz") as tar:
+ tar.extractall()
+ os.remove(destination_path)
+ except urllib.error.URLError as e:
+ print(f"Error while downloading {filename}: {e}")
+ continue
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docs-website/src/pages/docs/_components/SearchBar/index.jsx b/docs-website/src/pages/docs/_components/SearchBar/index.jsx
index 37f8a5c252aee6..054c041d8a9e5d 100644
--- a/docs-website/src/pages/docs/_components/SearchBar/index.jsx
+++ b/docs-website/src/pages/docs/_components/SearchBar/index.jsx
@@ -303,11 +303,16 @@ function SearchBar() {
strokeLinejoin="round"
>
-
- {docsSearchVersionsHelpers.versioningEnabled && }
-
- {!!searchResultState.totalResults && documentsFoundPlural(searchResultState.totalResults)}
+ {docsSearchVersionsHelpers.versioningEnabled && (
+
+ )}
+
+ {!!searchResultState.totalResults &&
+ documentsFoundPlural(searchResultState.totalResults)}
+
{searchResultState.items.length > 0 ? (
@@ -369,4 +374,4 @@ function SearchBar() {
);
}
-export default SearchBar;
+export default SearchBar;
\ No newline at end of file
diff --git a/docs-website/src/pages/docs/_components/SearchBar/search.module.scss b/docs-website/src/pages/docs/_components/SearchBar/search.module.scss
index 17e5f224906646..30a2973384ba69 100644
--- a/docs-website/src/pages/docs/_components/SearchBar/search.module.scss
+++ b/docs-website/src/pages/docs/_components/SearchBar/search.module.scss
@@ -21,13 +21,21 @@
height: 1.5rem;
}
+.searchQueryInput {
+ padding: 0.8rem 0.8rem 0.8rem 3rem;
+}
+
+.searchVersionInput {
+ padding: 0.8rem 2rem 0.8rem 2rem;
+ text-align: center;
+}
+
.searchQueryInput,
.searchVersionInput {
border-radius: 1000em;
border-style: solid;
border-color: transparent;
font: var(--ifm-font-size-base) var(--ifm-font-family-base);
- padding: 0.8rem 0.8rem 0.8rem 3rem;
width: 100%;
background: var(--docsearch-searchbox-background);
color: var(--docsearch-text-color);
@@ -93,6 +101,7 @@
@media only screen and (max-width: 996px) {
.searchVersionColumn {
max-width: 40% !important;
+ margin: auto;
}
.searchResultsColumn {
@@ -113,9 +122,15 @@
.searchVersionColumn {
max-width: 100% !important;
padding-left: var(--ifm-spacing-horizontal) !important;
+ margin: auto;
}
}
+.searchVersionColumn {
+ margin: auto;
+}
+
+
.loadingSpinner {
width: 3rem;
height: 3rem;
diff --git a/docs-website/versioned_sidebars/version-0.10.5-sidebars.json b/docs-website/versioned_sidebars/version-0.10.5-sidebars.json
new file mode 100644
index 00000000000000..67179075fc994a
--- /dev/null
+++ b/docs-website/versioned_sidebars/version-0.10.5-sidebars.json
@@ -0,0 +1,594 @@
+{
+ "overviewSidebar": [
+ {
+ "label": "Getting Started",
+ "type": "category",
+ "collapsed": true,
+ "items": [
+ {
+ "type": "doc",
+ "label": "Introduction",
+ "id": "docs/features"
+ },
+ {
+ "type": "doc",
+ "label": "Quickstart",
+ "id": "docs/quickstart"
+ },
+ {
+ "type": "link",
+ "label": "Demo",
+ "href": "https://demo.datahubproject.io/"
+ },
+ "docs/what-is-datahub/datahub-concepts",
+ "docs/saas"
+ ]
+ },
+ {
+ "Integrations": [
+ {
+ "type": "doc",
+ "label": "Introduction",
+ "id": "metadata-ingestion/README"
+ },
+ {
+ "Quickstart Guides": [
+ {
+ "BigQuery": [
+ "docs/quick-ingestion-guides/bigquery/overview",
+ "docs/quick-ingestion-guides/bigquery/setup",
+ "docs/quick-ingestion-guides/bigquery/configuration"
+ ]
+ },
+ {
+ "Redshift": [
+ "docs/quick-ingestion-guides/redshift/overview",
+ "docs/quick-ingestion-guides/redshift/setup",
+ "docs/quick-ingestion-guides/redshift/configuration"
+ ]
+ },
+ {
+ "Snowflake": [
+ "docs/quick-ingestion-guides/snowflake/overview",
+ "docs/quick-ingestion-guides/snowflake/setup",
+ "docs/quick-ingestion-guides/snowflake/configuration"
+ ]
+ },
+ {
+ "Tableau": [
+ "docs/quick-ingestion-guides/tableau/overview",
+ "docs/quick-ingestion-guides/tableau/setup",
+ "docs/quick-ingestion-guides/tableau/configuration"
+ ]
+ },
+ {
+ "PowerBI": [
+ "docs/quick-ingestion-guides/powerbi/overview",
+ "docs/quick-ingestion-guides/powerbi/setup",
+ "docs/quick-ingestion-guides/powerbi/configuration"
+ ]
+ }
+ ]
+ },
+ {
+ "Sources": [
+ {
+ "type": "doc",
+ "id": "docs/lineage/airflow",
+ "label": "Airflow"
+ },
+ "metadata-integration/java/spark-lineage/README",
+ "metadata-ingestion/integration_docs/great-expectations",
+ "metadata-integration/java/datahub-protobuf/README",
+ {
+ "type": "autogenerated",
+ "dirName": "docs/generated/ingestion/sources"
+ }
+ ]
+ },
+ {
+ "Sinks": [
+ {
+ "type": "autogenerated",
+ "dirName": "metadata-ingestion/sink_docs"
+ }
+ ]
+ },
+ {
+ "Transformers": [
+ "metadata-ingestion/docs/transformer/intro",
+ "metadata-ingestion/docs/transformer/dataset_transformer"
+ ]
+ },
+ {
+ "Advanced Guides": [
+ {
+ "Scheduling Ingestion": [
+ "metadata-ingestion/schedule_docs/intro",
+ "metadata-ingestion/schedule_docs/cron",
+ "metadata-ingestion/schedule_docs/airflow",
+ "metadata-ingestion/schedule_docs/kubernetes"
+ ]
+ },
+ "docs/platform-instances",
+ "metadata-ingestion/docs/dev_guides/stateful",
+ "metadata-ingestion/docs/dev_guides/classification",
+ "metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source",
+ "metadata-ingestion/docs/dev_guides/sql_profiles"
+ ]
+ }
+ ]
+ },
+ {
+ "Deployment": [
+ "docs/deploy/aws",
+ "docs/deploy/gcp",
+ "docker/README",
+ "docs/deploy/kubernetes",
+ "docs/deploy/environment-vars",
+ {
+ "Authentication": [
+ "docs/authentication/README",
+ "docs/authentication/concepts",
+ "docs/authentication/changing-default-credentials",
+ "docs/authentication/guides/add-users",
+ {
+ "Frontend Authentication": [
+ "docs/authentication/guides/jaas",
+ {
+ "OIDC Authentication": [
+ "docs/authentication/guides/sso/configure-oidc-react",
+ "docs/authentication/guides/sso/configure-oidc-react-google",
+ "docs/authentication/guides/sso/configure-oidc-react-okta",
+ "docs/authentication/guides/sso/configure-oidc-react-azure"
+ ]
+ }
+ ]
+ },
+ "docs/authentication/introducing-metadata-service-authentication",
+ "docs/authentication/personal-access-tokens"
+ ]
+ },
+ {
+ "Authorization": [
+ "docs/authorization/README",
+ "docs/authorization/roles",
+ "docs/authorization/policies",
+ "docs/authorization/groups"
+ ]
+ },
+ {
+ "Advanced Guides": [
+ "docs/how/delete-metadata",
+ "docs/how/configuring-authorization-with-apache-ranger",
+ "docs/how/backup-datahub",
+ "docs/how/restore-indices",
+ "docs/advanced/db-retention",
+ "docs/advanced/monitoring",
+ "docs/how/extract-container-logs",
+ "docs/deploy/telemetry",
+ "docs/how/kafka-config",
+ "docs/deploy/confluent-cloud",
+ "docs/advanced/no-code-upgrade",
+ "docs/how/jattach-guide"
+ ]
+ },
+ "docs/how/updating-datahub"
+ ]
+ },
+ {
+ "API": [
+ "docs/api/datahub-apis",
+ {
+ "GraphQL API": [
+ {
+ "label": "Overview",
+ "type": "doc",
+ "id": "docs/api/graphql/overview"
+ },
+ {
+ "Reference": [
+ {
+ "type": "doc",
+ "label": "Queries",
+ "id": "graphql/queries"
+ },
+ {
+ "type": "doc",
+ "label": "Mutations",
+ "id": "graphql/mutations"
+ },
+ {
+ "type": "doc",
+ "label": "Objects",
+ "id": "graphql/objects"
+ },
+ {
+ "type": "doc",
+ "label": "Inputs",
+ "id": "graphql/inputObjects"
+ },
+ {
+ "type": "doc",
+ "label": "Interfaces",
+ "id": "graphql/interfaces"
+ },
+ {
+ "type": "doc",
+ "label": "Unions",
+ "id": "graphql/unions"
+ },
+ {
+ "type": "doc",
+ "label": "Enums",
+ "id": "graphql/enums"
+ },
+ {
+ "type": "doc",
+ "label": "Scalars",
+ "id": "graphql/scalars"
+ }
+ ]
+ },
+ {
+ "Guides": [
+ {
+ "type": "doc",
+ "label": "How To Set Up GraphQL",
+ "id": "docs/api/graphql/how-to-set-up-graphql"
+ },
+ {
+ "type": "doc",
+ "label": "Getting Started With GraphQL",
+ "id": "docs/api/graphql/getting-started"
+ },
+ {
+ "type": "doc",
+ "label": "Access Token Management",
+ "id": "docs/api/graphql/token-management"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "type": "doc",
+ "label": "OpenAPI",
+ "id": "docs/api/openapi/openapi-usage-guide"
+ },
+ "docs/dev-guides/timeline",
+ {
+ "Rest.li API": [
+ {
+ "type": "doc",
+ "label": "Rest.li API Guide",
+ "id": "docs/api/restli/restli-overview"
+ },
+ {
+ "type": "doc",
+ "label": "Restore Indices",
+ "id": "docs/api/restli/restore-indices"
+ },
+ {
+ "type": "doc",
+ "label": "Get Index Sizes",
+ "id": "docs/api/restli/get-index-sizes"
+ },
+ {
+ "type": "doc",
+ "label": "Truncate Timeseries Aspect",
+ "id": "docs/api/restli/truncate-time-series-aspect"
+ },
+ {
+ "type": "doc",
+ "label": "Get ElasticSearch Task Status Endpoint",
+ "id": "docs/api/restli/get-elastic-task-status"
+ },
+ {
+ "type": "doc",
+ "label": "Evaluate Tests",
+ "id": "docs/api/restli/evaluate-tests"
+ },
+ {
+ "type": "doc",
+ "label": "Aspect Versioning and Rest.li Modeling",
+ "id": "docs/advanced/aspect-versioning"
+ }
+ ]
+ },
+ {
+ "Python SDK": [
+ "metadata-ingestion/as-a-library",
+ {
+ "Python SDK Reference": [
+ {
+ "type": "autogenerated",
+ "dirName": "python-sdk"
+ }
+ ]
+ }
+ ]
+ },
+ "metadata-integration/java/as-a-library",
+ {
+ "API and SDK Guides": [
+ "docs/advanced/patch",
+ "docs/api/tutorials/datasets",
+ "docs/api/tutorials/lineage",
+ "docs/api/tutorials/tags",
+ "docs/api/tutorials/terms",
+ "docs/api/tutorials/owners",
+ "docs/api/tutorials/domains",
+ "docs/api/tutorials/deprecation",
+ "docs/api/tutorials/descriptions",
+ "docs/api/tutorials/custom-properties",
+ "docs/api/tutorials/ml"
+ ]
+ },
+ {
+ "type": "category",
+ "label": "DataHub CLI",
+ "link": {
+ "type": "doc",
+ "id": "docs/cli"
+ },
+ "items": [
+ "docs/datahub_lite"
+ ]
+ },
+ {
+ "type": "category",
+ "label": "Datahub Actions",
+ "link": {
+ "type": "doc",
+ "id": "docs/act-on-metadata"
+ },
+ "items": [
+ "docs/actions/README",
+ "docs/actions/quickstart",
+ "docs/actions/concepts",
+ {
+ "Sources": [
+ {
+ "type": "autogenerated",
+ "dirName": "docs/actions/sources"
+ }
+ ]
+ },
+ {
+ "Events": [
+ {
+ "type": "autogenerated",
+ "dirName": "docs/actions/events"
+ }
+ ]
+ },
+ {
+ "Actions": [
+ {
+ "type": "autogenerated",
+ "dirName": "docs/actions/actions"
+ }
+ ]
+ },
+ {
+ "Guides": [
+ {
+ "type": "autogenerated",
+ "dirName": "docs/actions/guides"
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "Features": [
+ "docs/ui-ingestion",
+ "docs/how/search",
+ "docs/schema-history",
+ "docs/domains",
+ "docs/dataproducts",
+ "docs/glossary/business-glossary",
+ "docs/tags",
+ "docs/ownership/ownership-types",
+ "docs/browse",
+ "docs/authorization/access-policies-guide",
+ "docs/features/dataset-usage-and-query-history",
+ "docs/posts",
+ "docs/sync-status",
+ "docs/lineage/lineage-feature-guide",
+ {
+ "type": "doc",
+ "id": "docs/tests/metadata-tests",
+ "className": "saasOnly"
+ },
+ "docs/act-on-metadata/impact-analysis",
+ {
+ "Observability": [
+ "docs/managed-datahub/observe/freshness-assertions"
+ ]
+ }
+ ]
+ },
+ {
+ "Develop": [
+ {
+ "DataHub Metadata Model": [
+ "docs/modeling/metadata-model",
+ "docs/modeling/extending-the-metadata-model",
+ "docs/what/mxe",
+ {
+ "Entities": [
+ {
+ "type": "autogenerated",
+ "dirName": "docs/generated/metamodel/entities"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "Architecture": [
+ "docs/architecture/architecture",
+ "docs/components",
+ "docs/architecture/metadata-ingestion",
+ "docs/architecture/metadata-serving",
+ "docs/architecture/docker-containers"
+ ]
+ },
+ {
+ "Developing on DataHub": [
+ "docs/developers",
+ "docs/docker/development",
+ "metadata-ingestion/developing",
+ "docs/api/graphql/graphql-endpoint-development",
+ {
+ "Modules": [
+ "datahub-web-react/README",
+ "datahub-frontend/README",
+ "datahub-graphql-core/README",
+ "metadata-service/README",
+ "metadata-jobs/mae-consumer-job/README",
+ "metadata-jobs/mce-consumer-job/README"
+ ]
+ }
+ ]
+ },
+ "docs/plugins",
+ {
+ "Troubleshooting": [
+ "docs/troubleshooting/quickstart",
+ "docs/troubleshooting/build",
+ "docs/troubleshooting/general"
+ ]
+ },
+ {
+ "Advanced": [
+ "metadata-ingestion/docs/dev_guides/reporting_telemetry",
+ "docs/advanced/mcp-mcl",
+ "docker/datahub-upgrade/README",
+ "docs/advanced/no-code-modeling",
+ "datahub-web-react/src/app/analytics/README",
+ "docs/how/migrating-graph-service-implementation",
+ "docs/advanced/field-path-spec-v2",
+ "metadata-ingestion/adding-source",
+ "docs/how/add-custom-ingestion-source",
+ "docs/how/add-custom-data-platform",
+ "docs/advanced/browse-paths-upgrade",
+ "docs/browseV2/browse-paths-v2"
+ ]
+ }
+ ]
+ },
+ {
+ "Community": [
+ "docs/slack",
+ "docs/townhalls",
+ "docs/townhall-history",
+ "docs/CODE_OF_CONDUCT",
+ "docs/CONTRIBUTING",
+ "docs/links",
+ "docs/rfc"
+ ]
+ },
+ {
+ "Managed DataHub": [
+ "docs/managed-datahub/managed-datahub-overview",
+ "docs/managed-datahub/welcome-acryl",
+ {
+ "type": "doc",
+ "id": "docs/managed-datahub/saas-slack-setup",
+ "className": "saasOnly"
+ },
+ {
+ "type": "doc",
+ "id": "docs/managed-datahub/approval-workflows",
+ "className": "saasOnly"
+ },
+ {
+ "Metadata Ingestion With Acryl": [
+ "docs/managed-datahub/metadata-ingestion-with-acryl/ingestion"
+ ]
+ },
+ {
+ "DataHub API": [
+ {
+ "type": "doc",
+ "id": "docs/managed-datahub/datahub-api/entity-events-api",
+ "className": "saasOnly"
+ },
+ {
+ "GraphQL API": [
+ "docs/managed-datahub/datahub-api/graphql-api/getting-started",
+ {
+ "type": "doc",
+ "id": "docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta",
+ "className": "saasOnly"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "Integrations": [
+ {
+ "type": "doc",
+ "id": "docs/managed-datahub/integrations/aws-privatelink",
+ "className": "saasOnly"
+ },
+ {
+ "type": "doc",
+ "id": "docs/managed-datahub/integrations/oidc-sso-integration",
+ "className": "saasOnly"
+ }
+ ]
+ },
+ {
+ "Operator Guide": [
+ {
+ "type": "doc",
+ "id": "docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws",
+ "className": "saasOnly"
+ },
+ {
+ "type": "doc",
+ "id": "docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge",
+ "className": "saasOnly"
+ }
+ ]
+ },
+ {
+ "type": "doc",
+ "id": "docs/managed-datahub/chrome-extension",
+ "className": "saasOnly"
+ },
+ {
+ "Managed DataHub Release History": [
+ "docs/managed-datahub/release-notes/v_0_2_10",
+ "docs/managed-datahub/release-notes/v_0_2_9",
+ "docs/managed-datahub/release-notes/v_0_2_8",
+ "docs/managed-datahub/release-notes/v_0_2_7",
+ "docs/managed-datahub/release-notes/v_0_2_6",
+ "docs/managed-datahub/release-notes/v_0_2_5",
+ "docs/managed-datahub/release-notes/v_0_2_4",
+ "docs/managed-datahub/release-notes/v_0_2_3",
+ "docs/managed-datahub/release-notes/v_0_2_2",
+ "docs/managed-datahub/release-notes/v_0_2_1",
+ "docs/managed-datahub/release-notes/v_0_2_0",
+ "docs/managed-datahub/release-notes/v_0_1_73",
+ "docs/managed-datahub/release-notes/v_0_1_72",
+ "docs/managed-datahub/release-notes/v_0_1_70",
+ "docs/managed-datahub/release-notes/v_0_1_69"
+ ]
+ }
+ ]
+ },
+ {
+ "Release History": [
+ "releases"
+ ]
+ }
+ ]
+}
diff --git a/docs-website/versions.json b/docs-website/versions.json
new file mode 100644
index 00000000000000..0b79ac9498e063
--- /dev/null
+++ b/docs-website/versions.json
@@ -0,0 +1,3 @@
+[
+ "0.10.5"
+]
diff --git a/docs/actions/concepts.md b/docs/actions/concepts.md
index 381f2551d22379..5b05a0c586a5d1 100644
--- a/docs/actions/concepts.md
+++ b/docs/actions/concepts.md
@@ -40,7 +40,11 @@ The Actions Framework consists of a few core concepts--
Each of these will be described in detail below.
-![](imgs/actions.png)
**In the Actions Framework, Events flow continuously from left-to-right.**
### Pipelines
diff --git a/docs/advanced/no-code-modeling.md b/docs/advanced/no-code-modeling.md
index e1fadee6d371a4..d76b776d3dddb2 100644
--- a/docs/advanced/no-code-modeling.md
+++ b/docs/advanced/no-code-modeling.md
@@ -159,11 +159,19 @@ along with simplifying the number of raw data models that need defined, includin
From an architectural PoV, we will move from a before that looks something like this:
-![no-code-before](../imgs/no-code-before.png)
to an after that looks like this
-![no-code-after](../imgs/no-code-after.png)
That is, a move away from patterns of strong-typing-everywhere to a more generic + flexible world.
@@ -211,7 +219,7 @@ record ServiceKey {
* Name of the service
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
name: string
diff --git a/docs/api/graphql/how-to-set-up-graphql.md b/docs/api/graphql/how-to-set-up-graphql.md
index 562e8edb9f5d9d..584bf34ad3f92d 100644
--- a/docs/api/graphql/how-to-set-up-graphql.md
+++ b/docs/api/graphql/how-to-set-up-graphql.md
@@ -62,7 +62,11 @@ Postman is a popular API client that provides a graphical user interface for sen
Within Postman, you can create a `POST` request and set the request URL to the `/api/graphql` endpoint.
In the request body, select the `GraphQL` option and enter your GraphQL query in the request body.
-![postman-graphql](../../imgs/apis/postman-graphql.png)
Please refer to [Querying with GraphQL](https://learning.postman.com/docs/sending-requests/graphql/graphql/) in the Postman documentation for more information.
diff --git a/docs/api/tutorials/custom-properties.md b/docs/api/tutorials/custom-properties.md
index dbc07bfaa712ee..fe0d7e62dcde83 100644
--- a/docs/api/tutorials/custom-properties.md
+++ b/docs/api/tutorials/custom-properties.md
@@ -34,7 +34,11 @@ In this example, we will add some custom properties `cluster_name` and `retentio
After you have ingested sample data, the dataset `fct_users_deleted` should have a custom properties section with `encoding` set to `utf-8`.
-![dataset-properties-before](../../imgs/apis/tutorials/dataset-properties-before.png)
```shell
datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)" --aspect datasetProperties
@@ -80,7 +84,11 @@ The following code adds custom properties `cluster_name` and `retention_time` to
You can now see the two new properties are added to `fct_users_deleted` and the previous property `encoding` is unchanged.
-![dataset-properties-added](../../imgs/apis/tutorials/dataset-properties-added.png)
We can also verify this operation by programmatically checking the `datasetProperties` aspect after running this code using the `datahub` cli.
@@ -130,7 +138,11 @@ The following code shows you how can add and remove custom properties in the sam
You can now see the `cluster_name` property is added to `fct_users_deleted` and the `retention_time` property is removed.
-![dataset-properties-added-removed](../../imgs/apis/tutorials/dataset-properties-added-removed.png)
We can also verify this operation programmatically by checking the `datasetProperties` aspect using the `datahub` cli.
@@ -179,7 +191,11 @@ The following code replaces the current custom properties with a new properties
You can now see the `cluster_name` and `retention_time` properties are added to `fct_users_deleted` but the previous `encoding` property is no longer present.
-![dataset-properties-replaced](../../imgs/apis/tutorials/dataset-properties-replaced.png)
We can also verify this operation programmatically by checking the `datasetProperties` aspect using the `datahub` cli.
diff --git a/docs/api/tutorials/datasets.md b/docs/api/tutorials/datasets.md
index 62b30e97c80202..7c6d4a88d4190e 100644
--- a/docs/api/tutorials/datasets.md
+++ b/docs/api/tutorials/datasets.md
@@ -42,7 +42,11 @@ For detailed steps, please refer to [Datahub Quickstart Guide](/docs/quickstart.
You can now see `realestate_db.sales` dataset has been created.
-![dataset-created](../../imgs/apis/tutorials/dataset-created.png)
## Delete Dataset
@@ -110,4 +114,8 @@ Expected Response:
The dataset `fct_users_deleted` has now been deleted, so if you search for a hive dataset named `fct_users_delete`, you will no longer be able to see it.
-![dataset-deleted](../../imgs/apis/tutorials/dataset-deleted.png)
diff --git a/docs/api/tutorials/deprecation.md b/docs/api/tutorials/deprecation.md
index 6a8f7c8a1d2bed..73e73f5224cbc1 100644
--- a/docs/api/tutorials/deprecation.md
+++ b/docs/api/tutorials/deprecation.md
@@ -155,4 +155,8 @@ Expected Response:
You can now see the dataset `fct_users_created` has been marked as `Deprecated.`
-![tag-removed](../../imgs/apis/tutorials/deprecation-updated.png)
diff --git a/docs/api/tutorials/descriptions.md b/docs/api/tutorials/descriptions.md
index 46f42b7a05be6a..27c57309ba76aa 100644
--- a/docs/api/tutorials/descriptions.md
+++ b/docs/api/tutorials/descriptions.md
@@ -275,7 +275,11 @@ Expected Response:
You can now see the description is added to `fct_users_deleted`.
-![dataset-description-added](../../imgs/apis/tutorials/dataset-description-added.png)
## Add Description on Column
@@ -357,4 +361,8 @@ Expected Response:
You can now see column description is added to `user_name` column of `fct_users_deleted`.
-![column-description-added](../../imgs/apis/tutorials/column-description-added.png)
diff --git a/docs/api/tutorials/domains.md b/docs/api/tutorials/domains.md
index c8c47f85c570f8..617864d233b7a6 100644
--- a/docs/api/tutorials/domains.md
+++ b/docs/api/tutorials/domains.md
@@ -74,7 +74,11 @@ Expected Response:
You can now see `Marketing` domain has been created under `Govern > Domains`.
-![domain-created](../../imgs/apis/tutorials/domain-created.png)
## Read Domains
@@ -209,7 +213,11 @@ Expected Response:
You can now see `Marketing` domain has been added to the dataset.
-![domain-added](../../imgs/apis/tutorials/domain-added.png)
## Remove Domains
@@ -259,4 +267,8 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \
You can now see a domain `Marketing` has been removed from the `fct_users_created` dataset.
-![domain-removed](../../imgs/apis/tutorials/domain-removed.png)
diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md
index e37986af7bbbd4..ce23a4d274e8ec 100644
--- a/docs/api/tutorials/lineage.md
+++ b/docs/api/tutorials/lineage.md
@@ -112,7 +112,11 @@ Expected Response:
You can now see the lineage between `fct_users_deleted` and `logging_events`.
-![lineage-added](../../imgs/apis/tutorials/lineage-added.png)
## Add Column-level Lineage
@@ -130,7 +134,11 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`.
You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage.
-![column-level-lineage-added](../../imgs/apis/tutorials/column-level-lineage-added.png)
## Read Lineage
diff --git a/docs/api/tutorials/ml.md b/docs/api/tutorials/ml.md
index b16f2669b30c74..cb77556d48ebf5 100644
--- a/docs/api/tutorials/ml.md
+++ b/docs/api/tutorials/ml.md
@@ -94,9 +94,17 @@ Please note that an MlModelGroup serves as a container for all the runs of a sin
You can search the entities in DataHub UI.
-![feature-table-created](../../imgs/apis/tutorials/feature-table-created.png)
-![model-group-created](../../imgs/apis/tutorials/model-group-created.png)
## Read ML Entities
@@ -499,6 +507,14 @@ Expected Response: (Note that this entity does not exist in the sample ingestion
You can access the `Features` or `Group` Tab of each entity to view the added entities.
-![feature-added-to-model](../../imgs/apis/tutorials/feature-added-to-model.png)
-![model-group-added-to-model](../../imgs/apis/tutorials/model-group-added-to-model.png)
diff --git a/docs/api/tutorials/owners.md b/docs/api/tutorials/owners.md
index 3c7a46b136d76c..5bc3b95cb5631b 100644
--- a/docs/api/tutorials/owners.md
+++ b/docs/api/tutorials/owners.md
@@ -77,7 +77,11 @@ Update succeeded for urn urn:li:corpuser:datahub.
### Expected Outcomes of Upserting User
You can see the user `The bar` has been created and the user `Datahub` has been updated under `Settings > Access > Users & Groups`
-![user-upserted](../../imgs/apis/tutorials/user-upserted.png)
## Upsert Group
@@ -125,7 +129,11 @@ Update succeeded for group urn:li:corpGroup:foogroup@acryl.io.
### Expected Outcomes of Upserting Group
You can see the group `Foo Group` has been created under `Settings > Access > Users & Groups`
-![group-upserted](../../imgs/apis/tutorials/group-upserted.png)
## Read Owners
@@ -272,7 +280,11 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \
You can now see `bfoo` has been added as an owner to the `fct_users_created` dataset.
-![ownership-added](../../imgs/apis/tutorials/owner-added.png)
## Remove Owners
@@ -340,4 +352,8 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \
You can now see `John Doe` has been removed as an owner from the `fct_users_created` dataset.
-![ownership-removed](../../imgs/apis/tutorials/owner-removed.png)
diff --git a/docs/api/tutorials/tags.md b/docs/api/tutorials/tags.md
index 2f80a833136c1a..b2234bf00bcb92 100644
--- a/docs/api/tutorials/tags.md
+++ b/docs/api/tutorials/tags.md
@@ -91,7 +91,11 @@ Expected Response:
You can now see the new tag `Deprecated` has been created.
-![tag-created](../../imgs/apis/tutorials/tag-created.png)
We can also verify this operation by programmatically searching `Deprecated` tag after running this code using the `datahub` cli.
@@ -307,7 +311,11 @@ Expected Response:
You can now see `Deprecated` tag has been added to `user_name` column.
-![tag-added](../../imgs/apis/tutorials/tag-added.png)
We can also verify this operation programmatically by checking the `globalTags` aspect using the `datahub` cli.
@@ -359,7 +367,11 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \
You can now see the `Deprecated` tag has been removed from the `user_name` column.
-![tag-removed](../../imgs/apis/tutorials/tag-removed.png)
We can also verify this operation programmatically by checking the `globalTags` aspect using the `datahub` cli.
diff --git a/docs/api/tutorials/terms.md b/docs/api/tutorials/terms.md
index 207e14ea4afe87..99acf77d26ab04 100644
--- a/docs/api/tutorials/terms.md
+++ b/docs/api/tutorials/terms.md
@@ -95,7 +95,11 @@ Expected Response:
You can now see the new term `Rate of Return` has been created.
-![term-created](../../imgs/apis/tutorials/term-created.png)
We can also verify this operation by programmatically searching `Rate of Return` term after running this code using the `datahub` cli.
@@ -289,7 +293,11 @@ Expected Response:
You can now see `Rate of Return` term has been added to `user_name` column.
-![term-added](../../imgs/apis/tutorials/term-added.png)
## Remove Terms
@@ -361,4 +369,8 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \
You can now see the `Rate of Return` term has been removed from the `user_name` column.
-![term-removed](../../imgs/apis/tutorials/term-removed.png)
diff --git a/docs/architecture/architecture.md b/docs/architecture/architecture.md
index 6b76b995cc4275..6a9c1860d71b09 100644
--- a/docs/architecture/architecture.md
+++ b/docs/architecture/architecture.md
@@ -10,8 +10,16 @@ disparate tools & systems.
The figures below describe the high-level architecture of DataHub.
-![datahub-architecture](../imgs/datahub-architecture.png)
-![Acryl DataHub System Architecture ](../managed-datahub/imgs/saas/DataHub-Architecture.png)
For a more detailed look at the components that make up the Architecture, check out [Components](../components.md).
diff --git a/docs/architecture/metadata-ingestion.md b/docs/architecture/metadata-ingestion.md
index 2b60383319c684..abf8fc24d13856 100644
--- a/docs/architecture/metadata-ingestion.md
+++ b/docs/architecture/metadata-ingestion.md
@@ -6,7 +6,11 @@ title: "Ingestion Framework"
DataHub supports an extremely flexible ingestion architecture that can support push, pull, asynchronous and synchronous models.
The figure below describes all the options possible for connecting your favorite system to DataHub.
-![Ingestion Architecture](../imgs/ingestion-architecture.png)
## Metadata Change Proposal: The Center Piece
diff --git a/docs/architecture/metadata-serving.md b/docs/architecture/metadata-serving.md
index ada41179af4e0a..57194f49d5ea41 100644
--- a/docs/architecture/metadata-serving.md
+++ b/docs/architecture/metadata-serving.md
@@ -6,7 +6,11 @@ title: "Serving Tier"
The figure below shows the high-level system diagram for DataHub's Serving Tier.
-![datahub-serving](../imgs/datahub-serving.png)
The primary component is called [the Metadata Service](../../metadata-service) and exposes a REST API and a GraphQL API for performing CRUD operations on metadata. The service also exposes search and graph query API-s to support secondary-index style queries, full-text search queries as well as relationship queries like lineage. In addition, the [datahub-frontend](../../datahub-frontend) service expose a GraphQL API on top of the metadata graph.
diff --git a/docs/authentication/concepts.md b/docs/authentication/concepts.md
index 715e94c7e03808..0940f86a805f19 100644
--- a/docs/authentication/concepts.md
+++ b/docs/authentication/concepts.md
@@ -11,7 +11,11 @@ We introduced a few important concepts to the Metadata Service to make authentic
In following sections, we'll take a closer look at each individually.
-![](../imgs/metadata-service-auth.png)
*High level overview of Metadata Service Authentication*
## What is an Actor?
diff --git a/docs/authentication/guides/sso/configure-oidc-react-azure.md b/docs/authentication/guides/sso/configure-oidc-react-azure.md
index d1859579678821..177387327c0e8e 100644
--- a/docs/authentication/guides/sso/configure-oidc-react-azure.md
+++ b/docs/authentication/guides/sso/configure-oidc-react-azure.md
@@ -32,7 +32,11 @@ Azure supports more than one redirect URI, so both can be configured at the same
At this point, your app registration should look like the following:
-![azure-setup-app-registration](img/azure-setup-app-registration.png)
e. Click **Register**.
@@ -40,7 +44,11 @@ e. Click **Register**.
Once registration is done, you will land on the app registration **Overview** tab. On the left-side navigation bar, click on **Authentication** under **Manage** and add extra redirect URIs if need be (if you want to support both local testing and Azure deployments).
-![azure-setup-authentication](img/azure-setup-authentication.png)
Click **Save**.
@@ -51,7 +59,11 @@ Select **Client secrets**, then **New client secret**. Type in a meaningful des
**IMPORTANT:** Copy the `value` of your newly created secret since Azure will never display its value afterwards.
-![azure-setup-certificates-secrets](img/azure-setup-certificates-secrets.png)
### 4. Configure API permissions
@@ -66,7 +78,11 @@ Click on **Add a permission**, then from the **Microsoft APIs** tab select **Mic
At this point, you should be looking at a screen like the following:
-![azure-setup-api-permissions](img/azure-setup-api-permissions.png)
### 5. Obtain Application (Client) ID
diff --git a/docs/authentication/guides/sso/configure-oidc-react-google.md b/docs/authentication/guides/sso/configure-oidc-react-google.md
index 474538097aae20..af62185e6e7872 100644
--- a/docs/authentication/guides/sso/configure-oidc-react-google.md
+++ b/docs/authentication/guides/sso/configure-oidc-react-google.md
@@ -31,7 +31,11 @@ Note that in order to complete this step you should be logged into a Google acco
c. Fill out the details in the App Information & Domain sections. Make sure the 'Application Home Page' provided matches where DataHub is deployed
at your organization.
-![google-setup-1](img/google-setup-1.png)
Once you've completed this, **Save & Continue**.
@@ -70,7 +74,11 @@ f. You will now receive a pair of values, a client id and a client secret. Bookm
At this point, you should be looking at a screen like the following:
-![google-setup-2](img/google-setup-2.png)
Success!
diff --git a/docs/authentication/guides/sso/configure-oidc-react-okta.md b/docs/authentication/guides/sso/configure-oidc-react-okta.md
index cfede999f1e700..320b887a28f163 100644
--- a/docs/authentication/guides/sso/configure-oidc-react-okta.md
+++ b/docs/authentication/guides/sso/configure-oidc-react-okta.md
@@ -69,8 +69,16 @@ for example, `https://dev-33231928.okta.com/.well-known/openid-configuration`.
At this point, you should be looking at a screen like the following:
-![okta-setup-1](img/okta-setup-1.png)
-![okta-setup-2](img/okta-setup-2.png)
Success!
@@ -96,7 +104,11 @@ Replacing the placeholders above with the client id & client secret received fro
>
> By default, we assume that the groups will appear in a claim named "groups". This can be customized using the `AUTH_OIDC_GROUPS_CLAIM` container configuration.
>
-> ![okta-setup-2](img/okta-setup-groups-claim.png)
+>
### 5. Restart `datahub-frontend-react` docker container
diff --git a/docs/authentication/guides/sso/img/azure-setup-api-permissions.png b/docs/authentication/guides/sso/img/azure-setup-api-permissions.png
deleted file mode 100755
index 4964b7d48ffec2..00000000000000
Binary files a/docs/authentication/guides/sso/img/azure-setup-api-permissions.png and /dev/null differ
diff --git a/docs/authentication/guides/sso/img/azure-setup-app-registration.png b/docs/authentication/guides/sso/img/azure-setup-app-registration.png
deleted file mode 100755
index ffb23a7e3ddec5..00000000000000
Binary files a/docs/authentication/guides/sso/img/azure-setup-app-registration.png and /dev/null differ
diff --git a/docs/authentication/guides/sso/img/azure-setup-authentication.png b/docs/authentication/guides/sso/img/azure-setup-authentication.png
deleted file mode 100755
index 2d27ec88fb40b9..00000000000000
Binary files a/docs/authentication/guides/sso/img/azure-setup-authentication.png and /dev/null differ
diff --git a/docs/authentication/guides/sso/img/azure-setup-certificates-secrets.png b/docs/authentication/guides/sso/img/azure-setup-certificates-secrets.png
deleted file mode 100755
index db6585d84d8eeb..00000000000000
Binary files a/docs/authentication/guides/sso/img/azure-setup-certificates-secrets.png and /dev/null differ
diff --git a/docs/authentication/guides/sso/img/google-setup-1.png b/docs/authentication/guides/sso/img/google-setup-1.png
deleted file mode 100644
index 88c674146f1e44..00000000000000
Binary files a/docs/authentication/guides/sso/img/google-setup-1.png and /dev/null differ
diff --git a/docs/authentication/guides/sso/img/google-setup-2.png b/docs/authentication/guides/sso/img/google-setup-2.png
deleted file mode 100644
index 850512b891d5f3..00000000000000
Binary files a/docs/authentication/guides/sso/img/google-setup-2.png and /dev/null differ
diff --git a/docs/authentication/guides/sso/img/okta-setup-1.png b/docs/authentication/guides/sso/img/okta-setup-1.png
deleted file mode 100644
index 3949f18657c5ec..00000000000000
Binary files a/docs/authentication/guides/sso/img/okta-setup-1.png and /dev/null differ
diff --git a/docs/authentication/guides/sso/img/okta-setup-2.png b/docs/authentication/guides/sso/img/okta-setup-2.png
deleted file mode 100644
index fa6ea4d9918948..00000000000000
Binary files a/docs/authentication/guides/sso/img/okta-setup-2.png and /dev/null differ
diff --git a/docs/authentication/guides/sso/img/okta-setup-groups-claim.png b/docs/authentication/guides/sso/img/okta-setup-groups-claim.png
deleted file mode 100644
index ed35426685e467..00000000000000
Binary files a/docs/authentication/guides/sso/img/okta-setup-groups-claim.png and /dev/null differ
diff --git a/docs/authentication/personal-access-tokens.md b/docs/authentication/personal-access-tokens.md
index 0188aab49444ea..dc57a989a4e0c8 100644
--- a/docs/authentication/personal-access-tokens.md
+++ b/docs/authentication/personal-access-tokens.md
@@ -71,7 +71,11 @@ curl 'http://localhost:8080/entities/urn:li:corpuser:datahub' -H 'Authorization:
Since authorization happens at the GMS level, ingestion is also protected behind access tokens. To use them, simply add a `token` to the sink config property as seen below:
-![](../imgs/ingestion-with-token.png)
:::note
diff --git a/docs/authorization/access-policies-guide.md b/docs/authorization/access-policies-guide.md
index 5820e513a83e30..1eabb64d2878f6 100644
--- a/docs/authorization/access-policies-guide.md
+++ b/docs/authorization/access-policies-guide.md
@@ -110,10 +110,13 @@ In the second step, we can simply select the Privileges that this Platform Polic
| Manage Tags | Allow the actor to create and remove any Tags |
| Manage Public Views | Allow the actor to create, edit, and remove any public (shared) Views. |
| Manage Ownership Types | Allow the actor to create, edit, and remove any Ownership Types. |
+| Manage Platform Settings | (Acryl DataHub only) Allow the actor to manage global integrations and notification settings |
+| Manage Monitors | (Acryl DataHub only) Allow the actor to create, remove, start, or stop any entity assertion monitors |
| Restore Indices API[^1] | Allow the actor to restore indices for a set of entities via API |
| Enable/Disable Writeability API[^1] | Allow the actor to enable or disable GMS writeability for use in data migrations |
| Apply Retention API[^1] | Allow the actor to apply aspect retention via API |
+
[^1]: Only active if REST_API_AUTHORIZATION_ENABLED environment flag is enabled
#### Step 3: Choose Policy Actors
@@ -204,8 +207,15 @@ The common Metadata Privileges, which span across entity types, include:
| Edit Status | Allow actor to edit the status of an entity (soft deleted or not). |
| Edit Domain | Allow actor to edit the Domain of an entity. |
| Edit Deprecation | Allow actor to edit the Deprecation status of an entity. |
-| Edit Assertions | Allow actor to add and remove assertions from an entity. |
-| Edit All | Allow actor to edit any information about an entity. Super user privileges. Controls the ability to ingest using API when REST API Authorization is enabled. |
+| Edit Lineage | Allow actor to edit custom lineage edges for the entity. |
+| Edit Data Product | Allow actor to edit the data product that an entity is part of |
+| Propose Tags | (Acryl DataHub only) Allow actor to propose new Tags for the entity. |
+| Propose Glossary Terms | (Acryl DataHub only) Allow actor to propose new Glossary Terms for the entity. |
+| Propose Documentation | (Acryl DataHub only) Allow actor to propose new Documentation for the entity. |
+| Manage Tag Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Tags for the entity. |
+| Manage Glossary Terms Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Glossary Terms for the entity. |
+| Manage Documentation Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Documentation for the entity |
+| Edit Entity | Allow actor to edit any information about an entity. Super user privileges. Controls the ability to ingest using API when REST API Authorization is enabled. |
| Get Timeline API[^1] | Allow actor to get the timeline of an entity via API. |
| Get Entity API[^1] | Allow actor to get an entity via API. |
| Get Timeseries Aspect API[^1] | Allow actor to get a timeseries aspect via API. |
@@ -225,10 +235,19 @@ The common Metadata Privileges, which span across entity types, include:
| Dataset | Edit Dataset Queries | Allow actor to edit the Highlighted Queries on the Queries tab of the dataset. |
| Dataset | View Dataset Usage | Allow actor to access usage metadata about a dataset both in the UI and in the GraphQL API. This includes example queries, number of queries, etc. Also applies to REST APIs when REST API Authorization is enabled. |
| Dataset | View Dataset Profile | Allow actor to access a dataset's profile both in the UI and in the GraphQL API. This includes snapshot statistics like #rows, #columns, null percentage per field, etc. |
+| Dataset | Edit Assertions | Allow actor to change the assertions associated with a dataset. |
+| Dataset | Edit Incidents | (Acryl DataHub only) Allow actor to change the incidents associated with a dataset. |
+| Dataset | Edit Monitors | (Acryl DataHub only) Allow actor to change the assertion monitors associated with a dataset. |
| Tag | Edit Tag Color | Allow actor to change the color of a Tag. |
| Group | Edit Group Members | Allow actor to add and remove members to a group. |
+| Group | Edit Contact Information | Allow actor to change email, slack handle associated with the group. |
+| Group | Manage Group Subscriptions | (Acryl DataHub only) Allow actor to subscribe the group to entities. |
+| Group | Manage Group Notifications | (Acryl DataHub only) Allow actor to change notification settings for the group. |
| User | Edit User Profile | Allow actor to change the user's profile including display name, bio, title, profile image, etc. |
| User + Group | Edit Contact Information | Allow actor to change the contact information such as email & chat handles. |
+| Term Group | Manage Direct Glossary Children | Allow actor to change the direct child Term Groups or Terms of the group. |
+| Term Group | Manage All Glossary Children | Allow actor to change any direct or indirect child Term Groups or Terms of the group. |
+
> **Still have questions about Privileges?** Let us know in [Slack](https://slack.datahubproject.io)!
diff --git a/docs/components.md b/docs/components.md
index ef76729bb37fbf..b59dabcf999cce 100644
--- a/docs/components.md
+++ b/docs/components.md
@@ -6,7 +6,11 @@ title: "Components"
The DataHub platform consists of the components shown in the following diagram.
-![DataHub Component Overview](./imgs/datahub-components.png)
## Metadata Store
diff --git a/docs/demo/DataHub-UIOverview.pdf b/docs/demo/DataHub-UIOverview.pdf
deleted file mode 100644
index cd6106e84ac236..00000000000000
Binary files a/docs/demo/DataHub-UIOverview.pdf and /dev/null differ
diff --git a/docs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf b/docs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf
deleted file mode 100644
index 71498045f9b5bf..00000000000000
Binary files a/docs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf and /dev/null differ
diff --git a/docs/demo/Data_Discoverability_at_SpotHero.pdf b/docs/demo/Data_Discoverability_at_SpotHero.pdf
deleted file mode 100644
index 83e37d8606428a..00000000000000
Binary files a/docs/demo/Data_Discoverability_at_SpotHero.pdf and /dev/null differ
diff --git a/docs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf b/docs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf
deleted file mode 100644
index 2d6a33a464650e..00000000000000
Binary files a/docs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf and /dev/null differ
diff --git a/docs/demo/Datahub_at_Grofers.pdf b/docs/demo/Datahub_at_Grofers.pdf
deleted file mode 100644
index c29cece9e250ac..00000000000000
Binary files a/docs/demo/Datahub_at_Grofers.pdf and /dev/null differ
diff --git a/docs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf b/docs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf
deleted file mode 100644
index 0d067eef28d03b..00000000000000
Binary files a/docs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf and /dev/null differ
diff --git a/docs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf b/docs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf
deleted file mode 100644
index 382754f863c8a3..00000000000000
Binary files a/docs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf and /dev/null differ
diff --git a/docs/demo/Saxo Bank Data Workbench.pdf b/docs/demo/Saxo Bank Data Workbench.pdf
deleted file mode 100644
index c43480d32b8f24..00000000000000
Binary files a/docs/demo/Saxo Bank Data Workbench.pdf and /dev/null differ
diff --git a/docs/demo/Taming the Data Beast Using DataHub.pdf b/docs/demo/Taming the Data Beast Using DataHub.pdf
deleted file mode 100644
index d0062465d92200..00000000000000
Binary files a/docs/demo/Taming the Data Beast Using DataHub.pdf and /dev/null differ
diff --git a/docs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf b/docs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf
deleted file mode 100644
index fb7bd2b693e877..00000000000000
Binary files a/docs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf and /dev/null differ
diff --git a/docs/demo/ViasatMetadataJourney.pdf b/docs/demo/ViasatMetadataJourney.pdf
deleted file mode 100644
index ccffd18a06d187..00000000000000
Binary files a/docs/demo/ViasatMetadataJourney.pdf and /dev/null differ
diff --git a/docs/deploy/aws.md b/docs/deploy/aws.md
index 7b01ffa02a7446..228fcb51d1a28f 100644
--- a/docs/deploy/aws.md
+++ b/docs/deploy/aws.md
@@ -201,7 +201,11 @@ Provision a MySQL database in AWS RDS that shares the VPC with the kubernetes cl
the VPC of the kubernetes cluster. Once the database is provisioned, you should be able to see the following page. Take
a note of the endpoint marked by the red box.
-![AWS RDS](../imgs/aws/aws-rds.png)
First, add the DB password to kubernetes by running the following.
@@ -234,7 +238,11 @@ Provision an elasticsearch domain running elasticsearch version 7.10 or above th
cluster or has VPC peering set up between the VPC of the kubernetes cluster. Once the domain is provisioned, you should
be able to see the following page. Take a note of the endpoint marked by the red box.
-![AWS Elasticsearch Service](../imgs/aws/aws-elasticsearch.png)
Update the elasticsearch settings under global in the values.yaml as follows.
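For reference, the block being edited typically looks something like the sketch below (the host is a placeholder for the endpoint noted above, and the exact key names should be checked against your chart version):

```yaml
global:
  elasticsearch:
    # placeholder endpoint copied from the Elasticsearch Service console page above
    host: "search-datahub-xxxxxx.us-west-2.es.amazonaws.com"
    port: "443"
    useSSL: "true"
```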
@@ -330,7 +338,11 @@ Provision an MSK cluster that shares the VPC with the kubernetes cluster or has
the kubernetes cluster. Once the cluster is provisioned, click the “View client information” button in the “Cluster
Summary” section. You should see a page like the one below. Take note of the endpoints marked by the red boxes.
-![AWS MSK](../imgs/aws/aws-msk.png)
Update the kafka settings under global in the values.yaml as follows.
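A rough sketch of the corresponding block (both endpoints are placeholders taken from the “View client information” page; verify the key names against your chart version):

```yaml
global:
  kafka:
    bootstrap:
      # placeholder bootstrap broker endpoint from the MSK console
      server: "b-1.datahub-msk.xxxxxx.kafka.us-west-2.amazonaws.com:9092"
    zookeeper:
      # placeholder Apache ZooKeeper endpoint from the MSK console
      server: "z-1.datahub-msk.xxxxxx.kafka.us-west-2.amazonaws.com:2181"
```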
diff --git a/docs/deploy/confluent-cloud.md b/docs/deploy/confluent-cloud.md
index d93ffcceaecee1..794b55d4686bfb 100644
--- a/docs/deploy/confluent-cloud.md
+++ b/docs/deploy/confluent-cloud.md
@@ -24,7 +24,11 @@ decommissioned.
To create the topics, navigate to your **Cluster** and click "Create Topic". Feel free to tweak the default topic configurations to
match your preferences.
-![CreateTopic](../imgs/confluent-create-topic.png)
## Step 2: Configure DataHub Container to use Confluent Cloud Topics
@@ -140,12 +144,20 @@ and another for the user info used for connecting to the schema registry. You'll
select "Clients" -> "Configure new Java Client". You should see a page like the following:
-![Config](../imgs/confluent-cloud-config.png)
You'll want to generate both a Kafka Cluster API Key & a Schema Registry key. Once you do so, you should see the config
automatically populate with your new secrets:
-![Config](../imgs/confluent-cloud-config-2.png)
You'll need to copy the values of `sasl.jaas.config` and `basic.auth.user.info`
for the next step.
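As a rough guide, the two values follow the standard Confluent client formats (the keys and secrets below are placeholders):

```
sasl.jaas.config=org.apache.kafka.common.security.plain.PlainLoginModule required username='<CLUSTER_API_KEY>' password='<CLUSTER_API_SECRET>';
basic.auth.user.info=<SCHEMA_REGISTRY_API_KEY>:<SCHEMA_REGISTRY_API_SECRET>
```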
diff --git a/docs/deploy/gcp.md b/docs/deploy/gcp.md
index 3713d69f90636c..0cd3d92a8f3cd3 100644
--- a/docs/deploy/gcp.md
+++ b/docs/deploy/gcp.md
@@ -65,16 +65,28 @@ the GKE page on [GCP website](https://console.cloud.google.com/kubernetes/discov
Once the deployment is successful, you should see a page like the one below in the "Services & Ingress" tab on the left.
-![Services and Ingress](../imgs/gcp/services_ingress.png)
Tick the checkbox for datahub-datahub-frontend and click the "CREATE INGRESS" button. You should land on the following page.
-![Ingress1](../imgs/gcp/ingress1.png)
Type in an arbitrary name for the ingress and click on the second step "Host and path rules". You should land on the
following page.
-![Ingress2](../imgs/gcp/ingress2.png)
Select "datahub-datahub-frontend" in the dropdown menu for backends, and then click on "ADD HOST AND PATH RULE" button.
In the second row that got created, add in the host name of choice (here gcp.datahubproject.io) and select
@@ -83,14 +95,22 @@ In the second row that got created, add in the host name of choice (here gcp.dat
This step adds the rule allowing requests from the chosen host name to be routed to the datahub-frontend service. Click
on step 3, "Frontend configuration". You should land on the following page.
-![Ingress3](../imgs/gcp/ingress3.png)
Choose HTTPS in the dropdown menu for protocol. To enable SSL, you need to add a certificate. If you do not have one,
you can click "CREATE A NEW CERTIFICATE" and input the host name of choice. GCP will create a certificate for you.
Now press "CREATE" button on the left to create ingress! After around 5 minutes, you should see the following.
-![Ingress Ready](../imgs/gcp/ingress_ready.png)
In your domain provider, add an A record for the host name set above using the IP address on the ingress page (noted
with the red box). Once DNS updates, you should be able to access DataHub through the host name!
@@ -98,5 +118,9 @@ with the red box). Once DNS updates, you should be able to access DataHub throug
Note: you can ignore the warning icon next to the ingress. It takes about ten minutes for the ingress to check that the backend service
is ready and show a check mark as follows. However, the ingress is fully functional once you see the above page.
-![Ingress Final](../imgs/gcp/ingress_final.png)
diff --git a/docs/dev-guides/timeline.md b/docs/dev-guides/timeline.md
index 966e659b909915..829aef1d3eefa1 100644
--- a/docs/dev-guides/timeline.md
+++ b/docs/dev-guides/timeline.md
@@ -14,7 +14,11 @@ The Timeline API is available in server versions `0.8.28` and higher. The `cli`
## Entity Timeline Conceptually
For the visually inclined, here is a conceptual diagram that illustrates how to think about the entity timeline with categorical changes overlaid on it.
-![../imgs/timeline/timeline-conceptually.png](../imgs/timeline/timeline-conceptually.png)
## Change Event
Each modification is modeled as a
@@ -228,8 +232,16 @@ http://localhost:8080/openapi/timeline/v1/urn%3Ali%3Adataset%3A%28urn%3Ali%3Adat
The API is browsable via the UI through the dropdown.
Here are a few screenshots showing how to navigate to it. You can try out the API and send example requests.
-![../imgs/timeline/dropdown-apis.png](../imgs/timeline/dropdown-apis.png)
-![../imgs/timeline/swagger-ui.png](../imgs/timeline/swagger-ui.png)
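For reference, a request against the timeline endpoint shown above has roughly this shape (the URN and query parameters are placeholders; the Swagger UI lists the exact parameters supported by your server version):

```
curl 'http://localhost:8080/openapi/timeline/v1/<url-encoded-dataset-urn>?categories=TECHNICAL_SCHEMA&start=0'
```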
# Future Work
diff --git a/docs/docker/development.md b/docs/docker/development.md
index 2153aa9dc613f1..91a303744a03bd 100644
--- a/docs/docker/development.md
+++ b/docs/docker/development.md
@@ -92,7 +92,11 @@ Environment variables control the debugging ports for GMS and the frontend.
The screenshot shows an example configuration for IntelliJ using the default GMS debugging port of 5001.
-![](../imgs/development/intellij-remote-debug.png)
## Tips for People New To Docker
diff --git a/docs/glossary/business-glossary.md b/docs/glossary/business-glossary.md
index faab6f12fc55e7..e10cbed30b9132 100644
--- a/docs/glossary/business-glossary.md
+++ b/docs/glossary/business-glossary.md
@@ -31,59 +31,103 @@ In order to view a Business Glossary, users must have the Platform Privilege cal
Once granted this privilege, you can access your Glossary by clicking the **Govern** dropdown at the top of the page and then clicking **Glossary**:
-![](../imgs/glossary/glossary-button.png)
You are now at the root of your Glossary and should see all Terms and Term Groups with no parents assigned to them. You should also notice a hierarchy navigator on the left where you can easily check out the structure of your Glossary!
-![](../imgs/glossary/root-glossary.png)
## Creating a Term or Term Group
There are two ways to create Terms and Term Groups through the UI. First, you can create directly from the Glossary home page by clicking the menu dots on the top right and selecting your desired option:
-![](../imgs/glossary/root-glossary-create.png)
You can also create Terms or Term Groups directly from a Term Group's page. In order to do that you need to click the menu dots on the top right and select what you want:
-![](../imgs/glossary/create-from-node.png)
Note that the modal that pops up will automatically set the current Term Group you are in as the **Parent**. You can easily change this by selecting the input and navigating through your Glossary to find your desired Term Group. In addition, you could start typing the name of a Term Group to see it appear by searching. You can also leave this input blank in order to create a Term or Term Group with no parent.
-![](../imgs/glossary/create-modal.png)
## Editing a Term or Term Group
In order to edit a Term or Term Group, you first need to go to the page of the Term or Term Group you want to edit. Then simply click the edit icon right next to the name to open up an inline editor. Change the text and it will save when you click outside or hit Enter.
-![](../imgs/glossary/edit-term.png)
## Moving a Term or Term Group
Once a Term or Term Group has been created, you can always move it to be under a different Term Group parent. In order to do this, click the menu dots on the top right of either entity and select **Move**.
-![](../imgs/glossary/move-term-button.png)
This will open a modal where you can navigate through your Glossary to find your desired Term Group.
-![](../imgs/glossary/move-term-modal.png)
## Deleting a Term or Term Group
In order to delete a Term or Term Group, you need to go to the entity page of what you want to delete then click the menu dots on the top right. From here you can select **Delete** followed by confirming through a separate modal. **Note**: at the moment we only support deleting Term Groups that do not have any children. Until cascade deleting is supported, you will have to delete all children first, then delete the Term Group.
-![](../imgs/glossary/delete-button.png)
## Adding a Term to an Entity
Once you've defined your Glossary, you can begin attaching terms to data assets. To add a Glossary Term to an asset, go to the entity page of your asset and find the **Add Terms** button on the right sidebar.
-![](../imgs/glossary/add-term-to-entity.png)
In the modal that pops up you can select the Term you care about in one of two ways:
- Search for the Term by name in the input
- Navigate through the Glossary dropdown that appears after clicking into the input
-![](../imgs/glossary/add-term-modal.png)
## Privileges
diff --git a/docs/how/configuring-authorization-with-apache-ranger.md b/docs/how/configuring-authorization-with-apache-ranger.md
index 26d3be6d358b2e..46f9432e6c18a7 100644
--- a/docs/how/configuring-authorization-with-apache-ranger.md
+++ b/docs/how/configuring-authorization-with-apache-ranger.md
@@ -67,7 +67,11 @@ Now, you should have the DataHub plugin registered with Apache Ranger. Next, we'
The **DATAHUB** plugin and the **ranger_datahub** service are shown in the screenshot below:
- ![Privacera Portal DATAHUB screenshot](../imgs/apache-ranger/datahub-plugin.png)
4. Create a new policy under service **ranger_datahub** - this will be used to control DataHub authorization.
5. Create a test user & assign them to a policy. We'll use the `datahub` user, which is the default root user inside DataHub.
@@ -80,7 +84,11 @@ Now, you should have the DataHub plugin registered with Apache Ranger. Next, we'
DataHub platform access policy screenshot:
- ![Privacera Portal DATAHUB screenshot](../imgs/apache-ranger/datahub-platform-access-policy.png)
Once we've created our first policy, we can set up DataHub to start authorizing requests using Ranger policies.
@@ -178,7 +186,11 @@ then follow the below sections to undo the configuration steps you have performe
The **ranger_datahub** service is shown in the screenshot below:
- ![Privacera Portal DATAHUB screenshot](../imgs/apache-ranger/datahub-plugin.png)
2. Delete the **datahub** plugin: execute the curl command below to delete the **datahub** plugin.
   Replace the variables with the corresponding values in the curl command.
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 2b6fd5571cc9ec..7ba516c82cf1b7 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -15,6 +15,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
- #8300: The Clickhouse source now inherits from TwoTierSQLAlchemy. Previously the container hierarchy was
  platform_instance -> container -> container db (None) -> container schema; now it is platform_instance -> container database.
- #8300: Added the `uri_opts` argument; arbitrary options can now be passed to the Clickhouse client.
+- #8659: BigQuery ingestion no longer creates DataPlatformInstance aspects by default.
+  This only affects users who were depending on this aspect for custom functionality;
+  the previous behavior can be re-enabled via the `include_data_platform_instance` config option.
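For anyone relying on that aspect, a minimal recipe sketch that opts back into the old behavior might look like this (the sink settings are placeholders; only the `include_data_platform_instance` flag comes from the note above):

```yaml
source:
  type: bigquery
  config:
    # re-enable the DataPlatformInstance aspects that are now off by default
    include_data_platform_instance: true
sink:
  type: datahub-rest
  config:
    server: http://localhost:8080  # placeholder GMS endpoint
```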
## 0.10.5
diff --git a/docs/imgs/add-schema-tag.png b/docs/imgs/add-schema-tag.png
deleted file mode 100644
index b6fd273389c904..00000000000000
Binary files a/docs/imgs/add-schema-tag.png and /dev/null differ
diff --git a/docs/imgs/add-tag-search.png b/docs/imgs/add-tag-search.png
deleted file mode 100644
index a129f5eba4271b..00000000000000
Binary files a/docs/imgs/add-tag-search.png and /dev/null differ
diff --git a/docs/imgs/add-tag.png b/docs/imgs/add-tag.png
deleted file mode 100644
index 386b4cdcd99113..00000000000000
Binary files a/docs/imgs/add-tag.png and /dev/null differ
diff --git a/docs/imgs/added-tag.png b/docs/imgs/added-tag.png
deleted file mode 100644
index 96ae48318a35a1..00000000000000
Binary files a/docs/imgs/added-tag.png and /dev/null differ
diff --git a/docs/imgs/airflow/connection_error.png b/docs/imgs/airflow/connection_error.png
deleted file mode 100644
index c2f3344b8cc452..00000000000000
Binary files a/docs/imgs/airflow/connection_error.png and /dev/null differ
diff --git a/docs/imgs/airflow/datahub_lineage_view.png b/docs/imgs/airflow/datahub_lineage_view.png
deleted file mode 100644
index c7c774c203d2f2..00000000000000
Binary files a/docs/imgs/airflow/datahub_lineage_view.png and /dev/null differ
diff --git a/docs/imgs/airflow/datahub_pipeline_entity.png b/docs/imgs/airflow/datahub_pipeline_entity.png
deleted file mode 100644
index 715baefd784ca4..00000000000000
Binary files a/docs/imgs/airflow/datahub_pipeline_entity.png and /dev/null differ
diff --git a/docs/imgs/airflow/datahub_pipeline_view.png b/docs/imgs/airflow/datahub_pipeline_view.png
deleted file mode 100644
index 5b3afd13c4ce69..00000000000000
Binary files a/docs/imgs/airflow/datahub_pipeline_view.png and /dev/null differ
diff --git a/docs/imgs/airflow/datahub_task_view.png b/docs/imgs/airflow/datahub_task_view.png
deleted file mode 100644
index 66b3487d87319d..00000000000000
Binary files a/docs/imgs/airflow/datahub_task_view.png and /dev/null differ
diff --git a/docs/imgs/airflow/entity_page_screenshot.png b/docs/imgs/airflow/entity_page_screenshot.png
deleted file mode 100644
index a782969a1f17b1..00000000000000
Binary files a/docs/imgs/airflow/entity_page_screenshot.png and /dev/null differ
diff --git a/docs/imgs/airflow/find_the_dag.png b/docs/imgs/airflow/find_the_dag.png
deleted file mode 100644
index 37cda041e4b750..00000000000000
Binary files a/docs/imgs/airflow/find_the_dag.png and /dev/null differ
diff --git a/docs/imgs/airflow/finding_failed_log.png b/docs/imgs/airflow/finding_failed_log.png
deleted file mode 100644
index 96552ba1e19839..00000000000000
Binary files a/docs/imgs/airflow/finding_failed_log.png and /dev/null differ
diff --git a/docs/imgs/airflow/paused_dag.png b/docs/imgs/airflow/paused_dag.png
deleted file mode 100644
index c314de5d38d750..00000000000000
Binary files a/docs/imgs/airflow/paused_dag.png and /dev/null differ
diff --git a/docs/imgs/airflow/successful_run.png b/docs/imgs/airflow/successful_run.png
deleted file mode 100644
index b997cc7210ff6b..00000000000000
Binary files a/docs/imgs/airflow/successful_run.png and /dev/null differ
diff --git a/docs/imgs/airflow/trigger_dag.png b/docs/imgs/airflow/trigger_dag.png
deleted file mode 100644
index a44999c929d4e2..00000000000000
Binary files a/docs/imgs/airflow/trigger_dag.png and /dev/null differ
diff --git a/docs/imgs/airflow/unpaused_dag.png b/docs/imgs/airflow/unpaused_dag.png
deleted file mode 100644
index 8462562f31d973..00000000000000
Binary files a/docs/imgs/airflow/unpaused_dag.png and /dev/null differ
diff --git a/docs/imgs/apache-ranger/datahub-platform-access-policy.png b/docs/imgs/apache-ranger/datahub-platform-access-policy.png
deleted file mode 100644
index 7e3ff6fd372a9d..00000000000000
Binary files a/docs/imgs/apache-ranger/datahub-platform-access-policy.png and /dev/null differ
diff --git a/docs/imgs/apache-ranger/datahub-plugin.png b/docs/imgs/apache-ranger/datahub-plugin.png
deleted file mode 100644
index 5dd044c0146570..00000000000000
Binary files a/docs/imgs/apache-ranger/datahub-plugin.png and /dev/null differ
diff --git a/docs/imgs/apis/postman-graphql.png b/docs/imgs/apis/postman-graphql.png
deleted file mode 100644
index 1cffd226fdf772..00000000000000
Binary files a/docs/imgs/apis/postman-graphql.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/column-description-added.png b/docs/imgs/apis/tutorials/column-description-added.png
deleted file mode 100644
index ed8cbd3bf56220..00000000000000
Binary files a/docs/imgs/apis/tutorials/column-description-added.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/column-level-lineage-added.png b/docs/imgs/apis/tutorials/column-level-lineage-added.png
deleted file mode 100644
index 6092436e0a6a83..00000000000000
Binary files a/docs/imgs/apis/tutorials/column-level-lineage-added.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/custom-properties-added.png b/docs/imgs/apis/tutorials/custom-properties-added.png
deleted file mode 100644
index a7e85d875045c9..00000000000000
Binary files a/docs/imgs/apis/tutorials/custom-properties-added.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/datahub-main-ui.png b/docs/imgs/apis/tutorials/datahub-main-ui.png
deleted file mode 100644
index b058e2683a8513..00000000000000
Binary files a/docs/imgs/apis/tutorials/datahub-main-ui.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/dataset-created.png b/docs/imgs/apis/tutorials/dataset-created.png
deleted file mode 100644
index 086dd8b7c9b16e..00000000000000
Binary files a/docs/imgs/apis/tutorials/dataset-created.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/dataset-deleted.png b/docs/imgs/apis/tutorials/dataset-deleted.png
deleted file mode 100644
index d94ad7e85195fa..00000000000000
Binary files a/docs/imgs/apis/tutorials/dataset-deleted.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/dataset-description-added.png b/docs/imgs/apis/tutorials/dataset-description-added.png
deleted file mode 100644
index 41aa9f109115b2..00000000000000
Binary files a/docs/imgs/apis/tutorials/dataset-description-added.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/dataset-properties-added-removed.png b/docs/imgs/apis/tutorials/dataset-properties-added-removed.png
deleted file mode 100644
index 9eb0284776f13c..00000000000000
Binary files a/docs/imgs/apis/tutorials/dataset-properties-added-removed.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/dataset-properties-added.png b/docs/imgs/apis/tutorials/dataset-properties-added.png
deleted file mode 100644
index e0d2acbb66eb5e..00000000000000
Binary files a/docs/imgs/apis/tutorials/dataset-properties-added.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/dataset-properties-before.png b/docs/imgs/apis/tutorials/dataset-properties-before.png
deleted file mode 100644
index b4915121a8c650..00000000000000
Binary files a/docs/imgs/apis/tutorials/dataset-properties-before.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/dataset-properties-replaced.png b/docs/imgs/apis/tutorials/dataset-properties-replaced.png
deleted file mode 100644
index 8624689c20ada4..00000000000000
Binary files a/docs/imgs/apis/tutorials/dataset-properties-replaced.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/deprecation-updated.png b/docs/imgs/apis/tutorials/deprecation-updated.png
deleted file mode 100644
index 06fedf746f694d..00000000000000
Binary files a/docs/imgs/apis/tutorials/deprecation-updated.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/domain-added.png b/docs/imgs/apis/tutorials/domain-added.png
deleted file mode 100644
index cb2002ec9ab4df..00000000000000
Binary files a/docs/imgs/apis/tutorials/domain-added.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/domain-created.png b/docs/imgs/apis/tutorials/domain-created.png
deleted file mode 100644
index cafab2a5e8d5cb..00000000000000
Binary files a/docs/imgs/apis/tutorials/domain-created.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/domain-removed.png b/docs/imgs/apis/tutorials/domain-removed.png
deleted file mode 100644
index 1b21172be11d23..00000000000000
Binary files a/docs/imgs/apis/tutorials/domain-removed.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/feature-added-to-model.png b/docs/imgs/apis/tutorials/feature-added-to-model.png
deleted file mode 100644
index 311506e4b27839..00000000000000
Binary files a/docs/imgs/apis/tutorials/feature-added-to-model.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/feature-table-created.png b/docs/imgs/apis/tutorials/feature-table-created.png
deleted file mode 100644
index 0541cbe572435f..00000000000000
Binary files a/docs/imgs/apis/tutorials/feature-table-created.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/group-upserted.png b/docs/imgs/apis/tutorials/group-upserted.png
deleted file mode 100644
index 5283f6273f02a6..00000000000000
Binary files a/docs/imgs/apis/tutorials/group-upserted.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/lineage-added.png b/docs/imgs/apis/tutorials/lineage-added.png
deleted file mode 100644
index b381498bad5ac4..00000000000000
Binary files a/docs/imgs/apis/tutorials/lineage-added.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/model-group-added-to-model.png b/docs/imgs/apis/tutorials/model-group-added-to-model.png
deleted file mode 100644
index 360b7fbb2d9220..00000000000000
Binary files a/docs/imgs/apis/tutorials/model-group-added-to-model.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/model-group-created.png b/docs/imgs/apis/tutorials/model-group-created.png
deleted file mode 100644
index 2e0fdcea4803f8..00000000000000
Binary files a/docs/imgs/apis/tutorials/model-group-created.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/owner-added.png b/docs/imgs/apis/tutorials/owner-added.png
deleted file mode 100644
index 6508c231cfb4ba..00000000000000
Binary files a/docs/imgs/apis/tutorials/owner-added.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/owner-removed.png b/docs/imgs/apis/tutorials/owner-removed.png
deleted file mode 100644
index a7b6567888caf0..00000000000000
Binary files a/docs/imgs/apis/tutorials/owner-removed.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/sample-ingestion.png b/docs/imgs/apis/tutorials/sample-ingestion.png
deleted file mode 100644
index 40aa0469048417..00000000000000
Binary files a/docs/imgs/apis/tutorials/sample-ingestion.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/tag-added.png b/docs/imgs/apis/tutorials/tag-added.png
deleted file mode 100644
index fd99a04f6cceba..00000000000000
Binary files a/docs/imgs/apis/tutorials/tag-added.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/tag-created.png b/docs/imgs/apis/tutorials/tag-created.png
deleted file mode 100644
index 99e3fea8a14e16..00000000000000
Binary files a/docs/imgs/apis/tutorials/tag-created.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/tag-removed.png b/docs/imgs/apis/tutorials/tag-removed.png
deleted file mode 100644
index 31a267549843e5..00000000000000
Binary files a/docs/imgs/apis/tutorials/tag-removed.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/term-added.png b/docs/imgs/apis/tutorials/term-added.png
deleted file mode 100644
index 62e285a92e7af0..00000000000000
Binary files a/docs/imgs/apis/tutorials/term-added.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/term-created.png b/docs/imgs/apis/tutorials/term-created.png
deleted file mode 100644
index deff0179b155ee..00000000000000
Binary files a/docs/imgs/apis/tutorials/term-created.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/term-removed.png b/docs/imgs/apis/tutorials/term-removed.png
deleted file mode 100644
index dbf9f35f093399..00000000000000
Binary files a/docs/imgs/apis/tutorials/term-removed.png and /dev/null differ
diff --git a/docs/imgs/apis/tutorials/user-upserted.png b/docs/imgs/apis/tutorials/user-upserted.png
deleted file mode 100644
index 38c5bbb9ad8280..00000000000000
Binary files a/docs/imgs/apis/tutorials/user-upserted.png and /dev/null differ
diff --git a/docs/imgs/aws/aws-elasticsearch.png b/docs/imgs/aws/aws-elasticsearch.png
deleted file mode 100644
index e16d5eee26fd85..00000000000000
Binary files a/docs/imgs/aws/aws-elasticsearch.png and /dev/null differ
diff --git a/docs/imgs/aws/aws-msk.png b/docs/imgs/aws/aws-msk.png
deleted file mode 100644
index 96a3173747007e..00000000000000
Binary files a/docs/imgs/aws/aws-msk.png and /dev/null differ
diff --git a/docs/imgs/aws/aws-rds.png b/docs/imgs/aws/aws-rds.png
deleted file mode 100644
index ab329952c77560..00000000000000
Binary files a/docs/imgs/aws/aws-rds.png and /dev/null differ
diff --git a/docs/imgs/browse-domains.png b/docs/imgs/browse-domains.png
deleted file mode 100644
index 41444470517d2a..00000000000000
Binary files a/docs/imgs/browse-domains.png and /dev/null differ
diff --git a/docs/imgs/cancelled-ingestion.png b/docs/imgs/cancelled-ingestion.png
deleted file mode 100644
index 0c4af7b66a8ff2..00000000000000
Binary files a/docs/imgs/cancelled-ingestion.png and /dev/null differ
diff --git a/docs/imgs/confluent-cloud-config-2.png b/docs/imgs/confluent-cloud-config-2.png
deleted file mode 100644
index 543101154f42cf..00000000000000
Binary files a/docs/imgs/confluent-cloud-config-2.png and /dev/null differ
diff --git a/docs/imgs/confluent-cloud-config.png b/docs/imgs/confluent-cloud-config.png
deleted file mode 100644
index a2490eab5c6a77..00000000000000
Binary files a/docs/imgs/confluent-cloud-config.png and /dev/null differ
diff --git a/docs/imgs/confluent-create-topic.png b/docs/imgs/confluent-create-topic.png
deleted file mode 100644
index 1972bb3770388f..00000000000000
Binary files a/docs/imgs/confluent-create-topic.png and /dev/null differ
diff --git a/docs/imgs/create-domain.png b/docs/imgs/create-domain.png
deleted file mode 100644
index 1db2090fca6b89..00000000000000
Binary files a/docs/imgs/create-domain.png and /dev/null differ
diff --git a/docs/imgs/create-new-ingestion-source-button.png b/docs/imgs/create-new-ingestion-source-button.png
deleted file mode 100644
index c425f0837c51d3..00000000000000
Binary files a/docs/imgs/create-new-ingestion-source-button.png and /dev/null differ
diff --git a/docs/imgs/create-secret.png b/docs/imgs/create-secret.png
deleted file mode 100644
index a0cc63e3b4892f..00000000000000
Binary files a/docs/imgs/create-secret.png and /dev/null differ
diff --git a/docs/imgs/custom-ingestion-cli-version.png b/docs/imgs/custom-ingestion-cli-version.png
deleted file mode 100644
index 43d4736684abb1..00000000000000
Binary files a/docs/imgs/custom-ingestion-cli-version.png and /dev/null differ
diff --git a/docs/imgs/datahub-architecture.png b/docs/imgs/datahub-architecture.png
deleted file mode 100644
index 236f939f74198b..00000000000000
Binary files a/docs/imgs/datahub-architecture.png and /dev/null differ
diff --git a/docs/imgs/datahub-architecture.svg b/docs/imgs/datahub-architecture.svg
deleted file mode 100644
index 842194a5e377ce..00000000000000
--- a/docs/imgs/datahub-architecture.svg
+++ /dev/null
@@ -1 +0,0 @@
-
\ No newline at end of file
diff --git a/docs/imgs/datahub-components.png b/docs/imgs/datahub-components.png
deleted file mode 100644
index 8b7d0e5330275a..00000000000000
Binary files a/docs/imgs/datahub-components.png and /dev/null differ
diff --git a/docs/imgs/datahub-logo-color-mark.svg b/docs/imgs/datahub-logo-color-mark.svg
deleted file mode 100644
index a984092952bae2..00000000000000
--- a/docs/imgs/datahub-logo-color-mark.svg
+++ /dev/null
@@ -1 +0,0 @@
-
\ No newline at end of file
diff --git a/docs/imgs/datahub-metadata-ingestion-framework.png b/docs/imgs/datahub-metadata-ingestion-framework.png
deleted file mode 100644
index 1319329710906d..00000000000000
Binary files a/docs/imgs/datahub-metadata-ingestion-framework.png and /dev/null differ
diff --git a/docs/imgs/datahub-metadata-model.png b/docs/imgs/datahub-metadata-model.png
deleted file mode 100644
index 59449cd0d4ef59..00000000000000
Binary files a/docs/imgs/datahub-metadata-model.png and /dev/null differ
diff --git a/docs/imgs/datahub-sequence-diagram.png b/docs/imgs/datahub-sequence-diagram.png
deleted file mode 100644
index b5a8f8a9c25ce2..00000000000000
Binary files a/docs/imgs/datahub-sequence-diagram.png and /dev/null differ
diff --git a/docs/imgs/datahub-serving.png b/docs/imgs/datahub-serving.png
deleted file mode 100644
index 67a2f8eb3f0856..00000000000000
Binary files a/docs/imgs/datahub-serving.png and /dev/null differ
diff --git a/docs/imgs/development/intellij-remote-debug.png b/docs/imgs/development/intellij-remote-debug.png
deleted file mode 100644
index 32a41a75d1dc38..00000000000000
Binary files a/docs/imgs/development/intellij-remote-debug.png and /dev/null differ
diff --git a/docs/imgs/domain-entities.png b/docs/imgs/domain-entities.png
deleted file mode 100644
index 5766d051fa209f..00000000000000
Binary files a/docs/imgs/domain-entities.png and /dev/null differ
diff --git a/docs/imgs/domains-tab.png b/docs/imgs/domains-tab.png
deleted file mode 100644
index 20be5b103fdcaa..00000000000000
Binary files a/docs/imgs/domains-tab.png and /dev/null differ
diff --git a/docs/imgs/entity-registry-diagram.png b/docs/imgs/entity-registry-diagram.png
deleted file mode 100644
index 08cb5edd8e13f2..00000000000000
Binary files a/docs/imgs/entity-registry-diagram.png and /dev/null differ
diff --git a/docs/imgs/entity.png b/docs/imgs/entity.png
deleted file mode 100644
index cfe9eb38b2921e..00000000000000
Binary files a/docs/imgs/entity.png and /dev/null differ
diff --git a/docs/imgs/example-mysql-recipe.png b/docs/imgs/example-mysql-recipe.png
deleted file mode 100644
index 9cb2cbb169a569..00000000000000
Binary files a/docs/imgs/example-mysql-recipe.png and /dev/null differ
diff --git a/docs/imgs/failed-ingestion.png b/docs/imgs/failed-ingestion.png
deleted file mode 100644
index 4f9de8eb002d2f..00000000000000
Binary files a/docs/imgs/failed-ingestion.png and /dev/null differ
diff --git a/docs/imgs/feature-create-new-tag.gif b/docs/imgs/feature-create-new-tag.gif
deleted file mode 100644
index 57b8ad852dd5b2..00000000000000
Binary files a/docs/imgs/feature-create-new-tag.gif and /dev/null differ
diff --git a/docs/imgs/feature-datahub-analytics.png b/docs/imgs/feature-datahub-analytics.png
deleted file mode 100644
index 7fe66b84682f9a..00000000000000
Binary files a/docs/imgs/feature-datahub-analytics.png and /dev/null differ
diff --git a/docs/imgs/feature-rich-documentation.gif b/docs/imgs/feature-rich-documentation.gif
deleted file mode 100644
index 48ad7956700226..00000000000000
Binary files a/docs/imgs/feature-rich-documentation.gif and /dev/null differ
diff --git a/docs/imgs/feature-tag-browse.gif b/docs/imgs/feature-tag-browse.gif
deleted file mode 100644
index e70a30db7d3ba9..00000000000000
Binary files a/docs/imgs/feature-tag-browse.gif and /dev/null differ
diff --git a/docs/imgs/feature-validation-timeseries.png b/docs/imgs/feature-validation-timeseries.png
deleted file mode 100644
index 28ce1daec5f32e..00000000000000
Binary files a/docs/imgs/feature-validation-timeseries.png and /dev/null differ
diff --git a/docs/imgs/feature-view-entitiy-details-via-lineage-vis.gif b/docs/imgs/feature-view-entitiy-details-via-lineage-vis.gif
deleted file mode 100644
index aad77df3735747..00000000000000
Binary files a/docs/imgs/feature-view-entitiy-details-via-lineage-vis.gif and /dev/null differ
diff --git a/docs/imgs/gcp/ingress1.png b/docs/imgs/gcp/ingress1.png
deleted file mode 100644
index 4cb49834af5b60..00000000000000
Binary files a/docs/imgs/gcp/ingress1.png and /dev/null differ
diff --git a/docs/imgs/gcp/ingress2.png b/docs/imgs/gcp/ingress2.png
deleted file mode 100644
index cdf2446b0e923b..00000000000000
Binary files a/docs/imgs/gcp/ingress2.png and /dev/null differ
diff --git a/docs/imgs/gcp/ingress3.png b/docs/imgs/gcp/ingress3.png
deleted file mode 100644
index cc3745ad97f5bd..00000000000000
Binary files a/docs/imgs/gcp/ingress3.png and /dev/null differ
diff --git a/docs/imgs/gcp/ingress_final.png b/docs/imgs/gcp/ingress_final.png
deleted file mode 100644
index a30ca744c49f76..00000000000000
Binary files a/docs/imgs/gcp/ingress_final.png and /dev/null differ
diff --git a/docs/imgs/gcp/ingress_ready.png b/docs/imgs/gcp/ingress_ready.png
deleted file mode 100644
index d14016e420fd3d..00000000000000
Binary files a/docs/imgs/gcp/ingress_ready.png and /dev/null differ
diff --git a/docs/imgs/gcp/services_ingress.png b/docs/imgs/gcp/services_ingress.png
deleted file mode 100644
index 1d9ff2b313715c..00000000000000
Binary files a/docs/imgs/gcp/services_ingress.png and /dev/null differ
diff --git a/docs/imgs/glossary/add-term-modal.png b/docs/imgs/glossary/add-term-modal.png
deleted file mode 100644
index e32a9cb8d648c6..00000000000000
Binary files a/docs/imgs/glossary/add-term-modal.png and /dev/null differ
diff --git a/docs/imgs/glossary/add-term-to-entity.png b/docs/imgs/glossary/add-term-to-entity.png
deleted file mode 100644
index 7487a68c0d7559..00000000000000
Binary files a/docs/imgs/glossary/add-term-to-entity.png and /dev/null differ
diff --git a/docs/imgs/glossary/create-from-node.png b/docs/imgs/glossary/create-from-node.png
deleted file mode 100644
index 70638d083343c2..00000000000000
Binary files a/docs/imgs/glossary/create-from-node.png and /dev/null differ
diff --git a/docs/imgs/glossary/create-modal.png b/docs/imgs/glossary/create-modal.png
deleted file mode 100644
index e84fb5a36e2d40..00000000000000
Binary files a/docs/imgs/glossary/create-modal.png and /dev/null differ
diff --git a/docs/imgs/glossary/delete-button.png b/docs/imgs/glossary/delete-button.png
deleted file mode 100644
index 3e0cc2a5b0a54a..00000000000000
Binary files a/docs/imgs/glossary/delete-button.png and /dev/null differ
diff --git a/docs/imgs/glossary/edit-term.png b/docs/imgs/glossary/edit-term.png
deleted file mode 100644
index 62b0e425c8c4f3..00000000000000
Binary files a/docs/imgs/glossary/edit-term.png and /dev/null differ
diff --git a/docs/imgs/glossary/glossary-button.png b/docs/imgs/glossary/glossary-button.png
deleted file mode 100644
index e4b8fd23935877..00000000000000
Binary files a/docs/imgs/glossary/glossary-button.png and /dev/null differ
diff --git a/docs/imgs/glossary/move-term-button.png b/docs/imgs/glossary/move-term-button.png
deleted file mode 100644
index df03c820340eff..00000000000000
Binary files a/docs/imgs/glossary/move-term-button.png and /dev/null differ
diff --git a/docs/imgs/glossary/move-term-modal.png b/docs/imgs/glossary/move-term-modal.png
deleted file mode 100644
index 0fda501911b2b0..00000000000000
Binary files a/docs/imgs/glossary/move-term-modal.png and /dev/null differ
diff --git a/docs/imgs/glossary/root-glossary-create.png b/docs/imgs/glossary/root-glossary-create.png
deleted file mode 100644
index c91f397eb6213c..00000000000000
Binary files a/docs/imgs/glossary/root-glossary-create.png and /dev/null differ
diff --git a/docs/imgs/glossary/root-glossary.png b/docs/imgs/glossary/root-glossary.png
deleted file mode 100644
index 1296c16b0dc3d1..00000000000000
Binary files a/docs/imgs/glossary/root-glossary.png and /dev/null differ
diff --git a/docs/imgs/ingestion-architecture.png b/docs/imgs/ingestion-architecture.png
deleted file mode 100644
index fc7bc74acacfaf..00000000000000
Binary files a/docs/imgs/ingestion-architecture.png and /dev/null differ
diff --git a/docs/imgs/ingestion-logs.png b/docs/imgs/ingestion-logs.png
deleted file mode 100644
index 42211be7379d6e..00000000000000
Binary files a/docs/imgs/ingestion-logs.png and /dev/null differ
diff --git a/docs/imgs/ingestion-privileges.png b/docs/imgs/ingestion-privileges.png
deleted file mode 100644
index 8e23868309676c..00000000000000
Binary files a/docs/imgs/ingestion-privileges.png and /dev/null differ
diff --git a/docs/imgs/ingestion-tab.png b/docs/imgs/ingestion-tab.png
deleted file mode 100644
index 046068c63bdb7b..00000000000000
Binary files a/docs/imgs/ingestion-tab.png and /dev/null differ
diff --git a/docs/imgs/ingestion-with-token.png b/docs/imgs/ingestion-with-token.png
deleted file mode 100644
index 5e1a2cce036f7a..00000000000000
Binary files a/docs/imgs/ingestion-with-token.png and /dev/null differ
diff --git a/docs/imgs/invite-users-button.png b/docs/imgs/invite-users-button.png
deleted file mode 100644
index a5d07a1c1e7e75..00000000000000
Binary files a/docs/imgs/invite-users-button.png and /dev/null differ
diff --git a/docs/imgs/invite-users-popup.png b/docs/imgs/invite-users-popup.png
deleted file mode 100644
index 621b1521eae752..00000000000000
Binary files a/docs/imgs/invite-users-popup.png and /dev/null differ
diff --git a/docs/imgs/lineage.png b/docs/imgs/lineage.png
deleted file mode 100644
index 7488c1e04c31b2..00000000000000
Binary files a/docs/imgs/lineage.png and /dev/null differ
diff --git a/docs/imgs/list-domains.png b/docs/imgs/list-domains.png
deleted file mode 100644
index 98a28130f8c990..00000000000000
Binary files a/docs/imgs/list-domains.png and /dev/null differ
diff --git a/docs/imgs/locust-example.png b/docs/imgs/locust-example.png
deleted file mode 100644
index bbae3e0ca19d07..00000000000000
Binary files a/docs/imgs/locust-example.png and /dev/null differ
diff --git a/docs/imgs/metadata-model-chart.png b/docs/imgs/metadata-model-chart.png
deleted file mode 100644
index 2fb74836549063..00000000000000
Binary files a/docs/imgs/metadata-model-chart.png and /dev/null differ
diff --git a/docs/imgs/metadata-model-to-fork-or-not-to.png b/docs/imgs/metadata-model-to-fork-or-not-to.png
deleted file mode 100644
index f9d89d555196d1..00000000000000
Binary files a/docs/imgs/metadata-model-to-fork-or-not-to.png and /dev/null differ
diff --git a/docs/imgs/metadata-modeling.png b/docs/imgs/metadata-modeling.png
deleted file mode 100644
index cbad7613e04e43..00000000000000
Binary files a/docs/imgs/metadata-modeling.png and /dev/null differ
diff --git a/docs/imgs/metadata-service-auth.png b/docs/imgs/metadata-service-auth.png
deleted file mode 100644
index 15a3ac51876c23..00000000000000
Binary files a/docs/imgs/metadata-service-auth.png and /dev/null differ
diff --git a/docs/imgs/metadata-serving.png b/docs/imgs/metadata-serving.png
deleted file mode 100644
index 54b928a0cff52e..00000000000000
Binary files a/docs/imgs/metadata-serving.png and /dev/null differ
diff --git a/docs/imgs/metadata.png b/docs/imgs/metadata.png
deleted file mode 100644
index 45bb0cdce12e95..00000000000000
Binary files a/docs/imgs/metadata.png and /dev/null differ
diff --git a/docs/imgs/name-ingestion-source.png b/docs/imgs/name-ingestion-source.png
deleted file mode 100644
index bde12082484738..00000000000000
Binary files a/docs/imgs/name-ingestion-source.png and /dev/null differ
diff --git a/docs/imgs/no-code-after.png b/docs/imgs/no-code-after.png
deleted file mode 100644
index c0eee88625ace9..00000000000000
Binary files a/docs/imgs/no-code-after.png and /dev/null differ
diff --git a/docs/imgs/no-code-before.png b/docs/imgs/no-code-before.png
deleted file mode 100644
index 50315578b1804a..00000000000000
Binary files a/docs/imgs/no-code-before.png and /dev/null differ
diff --git a/docs/imgs/platform-instances-for-ingestion.png b/docs/imgs/platform-instances-for-ingestion.png
deleted file mode 100644
index 740249a805fb85..00000000000000
Binary files a/docs/imgs/platform-instances-for-ingestion.png and /dev/null differ
diff --git a/docs/imgs/quickstart-ingestion-config.png b/docs/imgs/quickstart-ingestion-config.png
deleted file mode 100644
index de51777ccddc3a..00000000000000
Binary files a/docs/imgs/quickstart-ingestion-config.png and /dev/null differ
diff --git a/docs/imgs/reset-credentials-screen.png b/docs/imgs/reset-credentials-screen.png
deleted file mode 100644
index 4b680837b77ab1..00000000000000
Binary files a/docs/imgs/reset-credentials-screen.png and /dev/null differ
diff --git a/docs/imgs/reset-user-password-button.png b/docs/imgs/reset-user-password-button.png
deleted file mode 100644
index 5b1f3ee153d072..00000000000000
Binary files a/docs/imgs/reset-user-password-button.png and /dev/null differ
diff --git a/docs/imgs/reset-user-password-popup.png b/docs/imgs/reset-user-password-popup.png
deleted file mode 100644
index ac2456dde4d4d3..00000000000000
Binary files a/docs/imgs/reset-user-password-popup.png and /dev/null differ
diff --git a/docs/imgs/running-ingestion.png b/docs/imgs/running-ingestion.png
deleted file mode 100644
index a03fb444a029ed..00000000000000
Binary files a/docs/imgs/running-ingestion.png and /dev/null differ
diff --git a/docs/imgs/s3-ingestion/10_outputs.png b/docs/imgs/s3-ingestion/10_outputs.png
deleted file mode 100644
index e0d1ed3376ade9..00000000000000
Binary files a/docs/imgs/s3-ingestion/10_outputs.png and /dev/null differ
diff --git a/docs/imgs/s3-ingestion/1_crawler-info.png b/docs/imgs/s3-ingestion/1_crawler-info.png
deleted file mode 100644
index 12882473920479..00000000000000
Binary files a/docs/imgs/s3-ingestion/1_crawler-info.png and /dev/null differ
diff --git a/docs/imgs/s3-ingestion/2_crawler-type.png b/docs/imgs/s3-ingestion/2_crawler-type.png
deleted file mode 100644
index 4898438417913c..00000000000000
Binary files a/docs/imgs/s3-ingestion/2_crawler-type.png and /dev/null differ
diff --git a/docs/imgs/s3-ingestion/3_data-store.png b/docs/imgs/s3-ingestion/3_data-store.png
deleted file mode 100644
index d29e4b1be05d65..00000000000000
Binary files a/docs/imgs/s3-ingestion/3_data-store.png and /dev/null differ
diff --git a/docs/imgs/s3-ingestion/4_data-store-2.png b/docs/imgs/s3-ingestion/4_data-store-2.png
deleted file mode 100644
index c0a6f140bedb22..00000000000000
Binary files a/docs/imgs/s3-ingestion/4_data-store-2.png and /dev/null differ
diff --git a/docs/imgs/s3-ingestion/5_iam.png b/docs/imgs/s3-ingestion/5_iam.png
deleted file mode 100644
index 73a631cb74f560..00000000000000
Binary files a/docs/imgs/s3-ingestion/5_iam.png and /dev/null differ
diff --git a/docs/imgs/s3-ingestion/6_schedule.png b/docs/imgs/s3-ingestion/6_schedule.png
deleted file mode 100644
index c5df59348fbc69..00000000000000
Binary files a/docs/imgs/s3-ingestion/6_schedule.png and /dev/null differ
diff --git a/docs/imgs/s3-ingestion/7_output.png b/docs/imgs/s3-ingestion/7_output.png
deleted file mode 100644
index 6201fa40bcfb33..00000000000000
Binary files a/docs/imgs/s3-ingestion/7_output.png and /dev/null differ
diff --git a/docs/imgs/s3-ingestion/8_review.png b/docs/imgs/s3-ingestion/8_review.png
deleted file mode 100644
index 2d27e79c2128b8..00000000000000
Binary files a/docs/imgs/s3-ingestion/8_review.png and /dev/null differ
diff --git a/docs/imgs/s3-ingestion/9_run.png b/docs/imgs/s3-ingestion/9_run.png
deleted file mode 100644
index 2b0644f6ad0384..00000000000000
Binary files a/docs/imgs/s3-ingestion/9_run.png and /dev/null differ
diff --git a/docs/imgs/schedule-ingestion.png b/docs/imgs/schedule-ingestion.png
deleted file mode 100644
index 0e6ec8e268c32a..00000000000000
Binary files a/docs/imgs/schedule-ingestion.png and /dev/null differ
diff --git a/docs/imgs/schema-blame-blame-activated.png b/docs/imgs/schema-blame-blame-activated.png
deleted file mode 100644
index 363466c39aedfb..00000000000000
Binary files a/docs/imgs/schema-blame-blame-activated.png and /dev/null differ
diff --git a/docs/imgs/schema-history-audit-activated.png b/docs/imgs/schema-history-audit-activated.png
deleted file mode 100644
index f59676b9b8a8fd..00000000000000
Binary files a/docs/imgs/schema-history-audit-activated.png and /dev/null differ
diff --git a/docs/imgs/schema-history-latest-version.png b/docs/imgs/schema-history-latest-version.png
deleted file mode 100644
index 0a54df4d520d53..00000000000000
Binary files a/docs/imgs/schema-history-latest-version.png and /dev/null differ
diff --git a/docs/imgs/schema-history-older-version.png b/docs/imgs/schema-history-older-version.png
deleted file mode 100644
index 8d295f176104f7..00000000000000
Binary files a/docs/imgs/schema-history-older-version.png and /dev/null differ
diff --git a/docs/imgs/search-by-domain.png b/docs/imgs/search-by-domain.png
deleted file mode 100644
index 4b92e589591877..00000000000000
Binary files a/docs/imgs/search-by-domain.png and /dev/null differ
diff --git a/docs/imgs/search-domain.png b/docs/imgs/search-domain.png
deleted file mode 100644
index b1359e07d5fc21..00000000000000
Binary files a/docs/imgs/search-domain.png and /dev/null differ
diff --git a/docs/imgs/search-tag.png b/docs/imgs/search-tag.png
deleted file mode 100644
index cf4b6b629d1e23..00000000000000
Binary files a/docs/imgs/search-tag.png and /dev/null differ
diff --git a/docs/imgs/select-platform-template.png b/docs/imgs/select-platform-template.png
deleted file mode 100644
index 4f78e2b7309edc..00000000000000
Binary files a/docs/imgs/select-platform-template.png and /dev/null differ
diff --git a/docs/imgs/set-domain-id.png b/docs/imgs/set-domain-id.png
deleted file mode 100644
index 3e1dde4ae51ee1..00000000000000
Binary files a/docs/imgs/set-domain-id.png and /dev/null differ
diff --git a/docs/imgs/set-domain.png b/docs/imgs/set-domain.png
deleted file mode 100644
index 1c4460e747835d..00000000000000
Binary files a/docs/imgs/set-domain.png and /dev/null differ
diff --git a/docs/imgs/successful-ingestion.png b/docs/imgs/successful-ingestion.png
deleted file mode 100644
index fa8dbdff7501ed..00000000000000
Binary files a/docs/imgs/successful-ingestion.png and /dev/null differ
diff --git a/docs/imgs/timeline/dropdown-apis.png b/docs/imgs/timeline/dropdown-apis.png
deleted file mode 100644
index f7aba08bbc061f..00000000000000
Binary files a/docs/imgs/timeline/dropdown-apis.png and /dev/null differ
diff --git a/docs/imgs/timeline/swagger-ui.png b/docs/imgs/timeline/swagger-ui.png
deleted file mode 100644
index e52a57e8ca6706..00000000000000
Binary files a/docs/imgs/timeline/swagger-ui.png and /dev/null differ
diff --git a/docs/imgs/timeline/timeline-conceptually.png b/docs/imgs/timeline/timeline-conceptually.png
deleted file mode 100644
index 70bd843bf8aed7..00000000000000
Binary files a/docs/imgs/timeline/timeline-conceptually.png and /dev/null differ
diff --git a/docs/imgs/user-sign-up-screen.png b/docs/imgs/user-sign-up-screen.png
deleted file mode 100644
index 88c2589203bd18..00000000000000
Binary files a/docs/imgs/user-sign-up-screen.png and /dev/null differ
diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md
index ef4071f89c5855..21d59b777dd7c6 100644
--- a/docs/lineage/airflow.md
+++ b/docs/lineage/airflow.md
@@ -62,6 +62,7 @@ lazy_load_plugins = False
| datahub.cluster | prod | Name of the Airflow cluster. |
| datahub.capture_ownership_info | true | If true, the owners field of the DAG will be captured as a DataHub corpuser. |
| datahub.capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. |
+ | datahub.capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. |
| datahub.graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. |
5. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html).
@@ -80,9 +81,7 @@ Emitting DataHub ...
If you have created a custom Airflow operator [docs](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html) that inherits from the BaseOperator class,
when overriding the `execute` function, set inlets and outlets via `context['ti'].task.inlets` and `context['ti'].task.outlets`.
-The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs.
-
-
+The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs.
```python
class DbtOperator(BaseOperator):
@@ -97,8 +96,8 @@ class DbtOperator(BaseOperator):
def _get_lineage(self):
# Do some processing to get inlets/outlets
-
- return inlets, outlets
+
+ return inlets, outlets
```
If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. [source](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage)
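To make the pattern concrete, here is a minimal sketch of such an operator attaching lineage inside `execute` (the operator name and datasets are invented; the `Dataset` helper is assumed to be the same one used in the sample lineage DAGs):

```python
from airflow.models.baseoperator import BaseOperator
from datahub_provider.entities import Dataset  # same helper as in the sample DAGs


class MyTransformOperator(BaseOperator):  # hypothetical custom operator
    def execute(self, context):
        # Attach inlets/outlets to the task so the DataHub lineage backend
        # can pick them up once the task finishes.
        context["ti"].task.inlets = [Dataset("snowflake", "analytics.staging.orders_raw")]
        context["ti"].task.outlets = [Dataset("snowflake", "analytics.prod.orders")]
        # ... the operator's real work would go here ...
```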
@@ -172,7 +171,6 @@ Take a look at this sample DAG:
In order to use this example, you must first configure the DataHub hook. Like in ingestion, we support a DataHub REST hook and a Kafka-based hook. See step 1 above for details.
-
## Debugging
### Incorrect URLs
diff --git a/docs/links.md b/docs/links.md
index f175262b9b5d93..45ba391e557cdb 100644
--- a/docs/links.md
+++ b/docs/links.md
@@ -39,7 +39,7 @@
* [Creating Notebook-based Dynamic Dashboards](https://towardsdatascience.com/creating-notebook-based-dynamic-dashboards-91f936adc6f3)
## Talks & Presentations
-* [DataHub: Powering LinkedIn's Metadata](demo/DataHub_-_Powering_LinkedIn_Metadata.pdf) @ [Budapest Data Forum 2020](https://budapestdata.hu/2020/en/)
+* [DataHub: Powering LinkedIn's Metadata](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/DataHub_-_Powering_LinkedIn_Metadata.pdf) @ [Budapest Data Forum 2020](https://budapestdata.hu/2020/en/)
* [Taming the Data Beast Using DataHub](https://www.youtube.com/watch?v=bo4OhiPro7Y) @ [Data Engineering Melbourne Meetup November 2020](https://www.meetup.com/Data-Engineering-Melbourne/events/kgnvlrybcpbjc/)
* [Metadata Management And Integration At LinkedIn With DataHub](https://www.dataengineeringpodcast.com/datahub-metadata-management-episode-147/) @ [Data Engineering Podcast](https://www.dataengineeringpodcast.com)
* [The evolution of metadata: LinkedIn’s story](https://speakerdeck.com/shirshanka/the-evolution-of-metadata-linkedins-journey-strata-nyc-2019) @ [Strata Data Conference 2019](https://conferences.oreilly.com/strata/strata-ny-2019.html)
diff --git a/docs/managed-datahub/chrome-extension.md b/docs/managed-datahub/chrome-extension.md
index a614327c7fd29d..c6840f4e8e221d 100644
--- a/docs/managed-datahub/chrome-extension.md
+++ b/docs/managed-datahub/chrome-extension.md
@@ -10,7 +10,11 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability';
In order to use the Acryl DataHub Chrome extension, you need to download it onto your browser from the Chrome web store [here](https://chrome.google.com/webstore/detail/datahub-chrome-extension/aoenebhmfokhglijmoacfjcnebdpchfj).
-![](imgs/saas/chrome-store-extension-screenshot.png)
Simply click "Add to Chrome" then "Add extension" on the ensuing popup.
@@ -20,11 +24,19 @@ Once you have your extension installed, you'll need to configure it to work with
1. Click the extension button on the right of your browser's address bar to view all of your installed extensions. Click on the newly installed DataHub extension.
-![](imgs/saas/extension_open_popup.png)
2. Fill in your DataHub domain and click "Continue" in the extension popup that appears.
-![](imgs/saas/extension_enter_domain.png)
If your organization uses standard SaaS domains for Looker, you should be ready to go!
@@ -34,11 +46,19 @@ Some organizations have custom SaaS domains for Looker and some Acryl DataHub de
1. Click on the extension button and select your DataHub extension to open the popup again. Now click the settings icon in order to open the configurations page.
-![](imgs/saas/extension_open_options_page.png)
2. Fill out and save any custom configurations you have in the **TOOL CONFIGURATIONS** section. Here you can configure a custom domain, a Platform Instance associated with that domain, and the Environment set on your DataHub assets. If you don't have a custom domain but do have a custom Platform Instance or Environment, feel free to leave the domain field empty.
-![](imgs/saas/extension_custom_configs.png)
## Using the Extension
@@ -52,7 +72,11 @@ Once you have everything configured on your extension, it's time to use it!
4. Click the Acryl DataHub extension button on the bottom right of your page to open a drawer where you can now see additional information about this asset right from your DataHub instance.
-![](imgs/saas/extension_view_in_looker.png)
## Advanced: Self-Hosted DataHub
diff --git a/docs/managed-datahub/datahub-api/graphql-api/getting-started.md b/docs/managed-datahub/datahub-api/graphql-api/getting-started.md
index 3c57b0a21d96e4..57d46f05c4e0c2 100644
--- a/docs/managed-datahub/datahub-api/graphql-api/getting-started.md
+++ b/docs/managed-datahub/datahub-api/graphql-api/getting-started.md
@@ -10,7 +10,11 @@ For a full reference to the Queries & Mutations available for consumption, check
### Connecting to the API
-![](../../imgs/saas/image-(3).png)
+
+
+
+
+
When you generate the token, you will see an example of a `curl` command which you can use to connect to the GraphQL API.
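As a hedged companion to that `curl` example, the same request can be issued from Python; the host name and token below are placeholders, and the `/api/graphql` path with a bearer-token header is an assumption based on common DataHub deployments:

```python
import requests

DATAHUB_HOST = "https://<your-instance>.acryl.io"  # placeholder domain
TOKEN = "<personal-access-token>"                   # placeholder token generated in the UI

# A small query against the GraphQL endpoint; adjust the query to your needs.
response = requests.post(
    f"{DATAHUB_HOST}/api/graphql",
    json={"query": "{ me { corpUser { username } } }"},
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=30,
)
response.raise_for_status()
print(response.json())
```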
diff --git a/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md b/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md
index 89bacb2009e494..bfd8e8f2dae1bc 100644
--- a/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md
+++ b/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md
@@ -404,7 +404,11 @@ You can configure Acryl to send slack notifications to a specific channel when i
These notifications are also able to tag the immediate asset's owners, along with the owners of downstream assets consuming it.
-![](../../imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png)
+
+
+
+
+
To do so, simply follow the [Slack Integration Guide](docs/managed-datahub/saas-slack-setup.md) and contact your Acryl customer success team to enable the feature!
diff --git a/docs/managed-datahub/imgs/saas/DataHub-Architecture.png b/docs/managed-datahub/imgs/saas/DataHub-Architecture.png
deleted file mode 100644
index 95b3ab0b06ad64..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/DataHub-Architecture.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-13-at-7.45.56-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-13-at-7.45.56-PM.png
deleted file mode 100644
index 721989a6c37e11..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-13-at-7.45.56-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.35.17-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.35.17-PM.png
deleted file mode 100644
index dffac92f257c7b..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.35.17-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.37.22-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.37.22-PM.png
deleted file mode 100644
index ff0c29de1fbad3..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-01-24-at-4.37.22-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png
deleted file mode 100644
index 070bfd9f6b8975..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.43.25-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.43.25-PM.png
deleted file mode 100644
index b4bb4e2ba60edc..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.43.25-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.44.15-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.44.15-PM.png
deleted file mode 100644
index b0397afd1b3a40..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.44.15-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png
deleted file mode 100644
index 9258badb6f0889..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-03-22-at-6.46.41-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.52.55-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.52.55-PM.png
deleted file mode 100644
index 386b4cdcd99113..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.52.55-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.56.50-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.56.50-PM.png
deleted file mode 100644
index a129f5eba4271b..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.56.50-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.58.46-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.58.46-PM.png
deleted file mode 100644
index 96ae48318a35a1..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-4.58.46-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.01.16-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.01.16-PM.png
deleted file mode 100644
index b6fd273389c904..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.01.16-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.03.36-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.03.36-PM.png
deleted file mode 100644
index 0acd4e75bc6d2c..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-05-at-5.03.36-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-13-at-2.34.24-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-13-at-2.34.24-PM.png
deleted file mode 100644
index 364b9292cfaab7..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-04-13-at-2.34.24-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM-(1).png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM-(1).png
deleted file mode 100644
index 6a12dc545ec62c..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM-(1).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM.png
deleted file mode 100644
index 6a12dc545ec62c..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-7.56.16-AM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-8.02.55-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-8.02.55-AM.png
deleted file mode 100644
index 83645e00d724a4..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-13-at-8.02.55-AM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-11.02.47-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-11.02.47-AM.png
deleted file mode 100644
index a2f239ce847e07..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-11.02.47-AM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-12.59.38-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-12.59.38-PM.png
deleted file mode 100644
index e31d4b089d9292..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-06-24-at-12.59.38-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.21.42-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.21.42-AM.png
deleted file mode 100644
index c003581c9d1b63..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.21.42-AM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.22.23-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.22.23-AM.png
deleted file mode 100644
index 660dd121dd0a41..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.22.23-AM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.23.08-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.23.08-AM.png
deleted file mode 100644
index 07e3c71dba262f..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.23.08-AM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.47.57-AM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.47.57-AM.png
deleted file mode 100644
index 579e7f62af7085..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-22-at-11.47.57-AM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM-(1).png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM-(1).png
deleted file mode 100644
index f85f4d5c79bfb9..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM-(1).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM.png
deleted file mode 100644
index f85f4d5c79bfb9..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2022-08-29-at-6.07.25-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png
deleted file mode 100644
index cb8b7470cd957d..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png
deleted file mode 100644
index 1de51e33d87c23..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png
deleted file mode 100644
index df687dabe345c4..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM-(1).png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM-(1).png
deleted file mode 100644
index a8d9ee37c7a558..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM-(1).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png b/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png
deleted file mode 100644
index a8d9ee37c7a558..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Untitled(1).png b/docs/managed-datahub/imgs/saas/Untitled(1).png
deleted file mode 100644
index 87846e7897f6ed..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Untitled(1).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Untitled-(2)-(1).png b/docs/managed-datahub/imgs/saas/Untitled-(2)-(1).png
deleted file mode 100644
index 7715bf4a51fbe4..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Untitled-(2)-(1).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Untitled-(2).png b/docs/managed-datahub/imgs/saas/Untitled-(2).png
deleted file mode 100644
index a01a1af370442d..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Untitled-(2).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Untitled-(3).png b/docs/managed-datahub/imgs/saas/Untitled-(3).png
deleted file mode 100644
index 02d84b326896c8..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Untitled-(3).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Untitled-(4).png b/docs/managed-datahub/imgs/saas/Untitled-(4).png
deleted file mode 100644
index a01a1af370442d..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Untitled-(4).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/Untitled.png b/docs/managed-datahub/imgs/saas/Untitled.png
deleted file mode 100644
index a01a1af370442d..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/Untitled.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/chrome-store-extension-screenshot.png b/docs/managed-datahub/imgs/saas/chrome-store-extension-screenshot.png
deleted file mode 100644
index e00a4d57f32ddc..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/chrome-store-extension-screenshot.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/extension_custom_configs.png b/docs/managed-datahub/imgs/saas/extension_custom_configs.png
deleted file mode 100644
index b3d70dfac00ff4..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/extension_custom_configs.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/extension_developer_mode.png b/docs/managed-datahub/imgs/saas/extension_developer_mode.png
deleted file mode 100644
index e740d15912e174..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/extension_developer_mode.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/extension_enter_domain.png b/docs/managed-datahub/imgs/saas/extension_enter_domain.png
deleted file mode 100644
index 3304fa168beaf1..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/extension_enter_domain.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/extension_load_unpacked.png b/docs/managed-datahub/imgs/saas/extension_load_unpacked.png
deleted file mode 100644
index 8f56705cd91769..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/extension_load_unpacked.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/extension_open_options_page.png b/docs/managed-datahub/imgs/saas/extension_open_options_page.png
deleted file mode 100644
index c1366d5673b599..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/extension_open_options_page.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/extension_open_popup.png b/docs/managed-datahub/imgs/saas/extension_open_popup.png
deleted file mode 100644
index 216056b847fb51..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/extension_open_popup.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/extension_view_in_looker.png b/docs/managed-datahub/imgs/saas/extension_view_in_looker.png
deleted file mode 100644
index bf854b3e840f7b..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/extension_view_in_looker.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/home-(1).png b/docs/managed-datahub/imgs/saas/home-(1).png
deleted file mode 100644
index 88cf2017dd7e71..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/home-(1).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/home.png b/docs/managed-datahub/imgs/saas/home.png
deleted file mode 100644
index 8ad63deec75c9b..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/home.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(1).png b/docs/managed-datahub/imgs/saas/image-(1).png
deleted file mode 100644
index c1a249125fcf7c..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(1).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(10).png b/docs/managed-datahub/imgs/saas/image-(10).png
deleted file mode 100644
index a580fdc3d67309..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(10).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(11).png b/docs/managed-datahub/imgs/saas/image-(11).png
deleted file mode 100644
index ee95eb43842723..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(11).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(12).png b/docs/managed-datahub/imgs/saas/image-(12).png
deleted file mode 100644
index bbd8e6a66cf85b..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(12).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(13).png b/docs/managed-datahub/imgs/saas/image-(13).png
deleted file mode 100644
index bbd8e6a66cf85b..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(13).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(14).png b/docs/managed-datahub/imgs/saas/image-(14).png
deleted file mode 100644
index a580fdc3d67309..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(14).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(15).png b/docs/managed-datahub/imgs/saas/image-(15).png
deleted file mode 100644
index f282e2d92c1a1d..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(15).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(16).png b/docs/managed-datahub/imgs/saas/image-(16).png
deleted file mode 100644
index 1340c77bd648c8..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(16).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(17).png b/docs/managed-datahub/imgs/saas/image-(17).png
deleted file mode 100644
index 6eee2fb2d821fe..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(17).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(2).png b/docs/managed-datahub/imgs/saas/image-(2).png
deleted file mode 100644
index cf475edd7b95da..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(2).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(3).png b/docs/managed-datahub/imgs/saas/image-(3).png
deleted file mode 100644
index b08818ff3e97c6..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(3).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(4).png b/docs/managed-datahub/imgs/saas/image-(4).png
deleted file mode 100644
index a580fdc3d67309..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(4).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(5).png b/docs/managed-datahub/imgs/saas/image-(5).png
deleted file mode 100644
index 48438c6001e4f5..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(5).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(6).png b/docs/managed-datahub/imgs/saas/image-(6).png
deleted file mode 100644
index 54e569e853f246..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(6).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(7).png b/docs/managed-datahub/imgs/saas/image-(7).png
deleted file mode 100644
index 6e89e5881cfa78..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(7).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(8).png b/docs/managed-datahub/imgs/saas/image-(8).png
deleted file mode 100644
index ee0a3c89d58faa..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(8).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image-(9).png b/docs/managed-datahub/imgs/saas/image-(9).png
deleted file mode 100644
index 301ca98593ef9c..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image-(9).png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/image.png b/docs/managed-datahub/imgs/saas/image.png
deleted file mode 100644
index a1cfc3e74c5dd2..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/image.png and /dev/null differ
diff --git a/docs/managed-datahub/imgs/saas/settings.png b/docs/managed-datahub/imgs/saas/settings.png
deleted file mode 100644
index ca99984abbbc99..00000000000000
Binary files a/docs/managed-datahub/imgs/saas/settings.png and /dev/null differ
diff --git a/docs/managed-datahub/integrations/oidc-sso-integration.md b/docs/managed-datahub/integrations/oidc-sso-integration.md
index 6a9e085186b446..c0f5069d849fa7 100644
--- a/docs/managed-datahub/integrations/oidc-sso-integration.md
+++ b/docs/managed-datahub/integrations/oidc-sso-integration.md
@@ -42,4 +42,8 @@ To enable the OIDC integration, start by navigating to **Settings > Platform > S
4. If there are any advanced settings you would like to configure, click on the **Advanced** button. These come with defaults, so only input settings here if there is something you need changed from the default configuration.
5. Click **Update** to save your settings.
-![](../imgs/saas/image-(10).png)
+
+
+
+
+
diff --git a/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md b/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md
index 95ca6e5e33e160..e225fd8b014c88 100644
--- a/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md
+++ b/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md
@@ -56,9 +56,17 @@ In Acryl DataHub deployments, you _must_ use a sink of type `datahub-rest`, whic
2. **token**: a unique API key used to authenticate requests to your instance's REST API
The token can be retrieved by logging in as admin. You can go to Settings page and generate a Personal Access Token with your desired expiration date.
-![](../imgs/saas/home-(1).png)
-![](../imgs/saas/settings.png)
+
+
+
+
+
+
+
+
+
+
To configure your instance of DataHub as the destination for ingestion, set the "server" field of your recipe to point to your Acryl instance's domain suffixed by the path `/gms`, as shown below.
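As a hedged sketch of that sink block (the instance domain and token are placeholders, not values from this guide), expressed here as a Python dictionary rather than YAML:

```python
# Hedged sketch of the "sink" portion of a recipe for an Acryl instance.
# Replace <your-instance> and the token with your own values.
sink = {
    "type": "datahub-rest",
    "config": {
        "server": "https://<your-instance>.acryl.io/gms",  # instance domain + /gms path
        "token": "<personal-access-token>",                # generated from the Settings page
    },
}
```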
A complete example of a DataHub recipe file, which reads from MySQL and writes into a DataHub instance:
diff --git a/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md b/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md
index d389ec97d05502..6c6cce51ea098b 100644
--- a/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md
+++ b/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md
@@ -17,11 +17,19 @@ Acryl DataHub comes packaged with an Acryl-managed ingestion executor, which is
For example, if an ingestion source is not publicly accessible via the internet, e.g. hosted privately within a specific AWS account, then the Acryl executor will be unable to extract metadata from it.
-![Option 1: Acryl-hosted ingestion runner](../imgs/saas/image-(12).png)
+
+
+
+
+
To accommodate these cases, Acryl supports configuring a remote ingestion executor which can be deployed inside of your AWS account. This setup allows you to continue leveraging the Acryl DataHub console to create, schedule, and run metadata ingestion, all while retaining network and credential isolation.
-![Option 2: Customer-hosted ingestion runner](../imgs/saas/image-(6).png)
+
+
+
+
+
## Deploying a Remote Ingestion Executor
1. **Provide AWS Account Id**: Provide the Acryl Team with the id of the AWS account in which the remote executor will be hosted. This will be used to grant access to private Acryl containers and create a unique SQS queue which your remote agent will subscribe to. The account id can be provided to your Acryl representative via Email or [One Time Secret](https://onetimesecret.com/).
@@ -40,23 +48,39 @@ To accommodate these cases, Acryl supports configuring a remote ingestion execut
Note that the only external secret provider that is currently supported is AWS Secrets Manager.
-![](../imgs/saas/Screen-Shot-2023-01-19-at-5.12.47-PM.png)
-![](../imgs/saas/Screen-Shot-2023-01-19-at-5.12.56-PM.png)
+
+
+
+
+
+
+
+
+
+
3. **Test the Executor:** To test your remote executor:
    1. Create a new Ingestion Source by clicking '**Create new Source**' in the '**Ingestion**' tab of the DataHub console. Configure your Ingestion Recipe as though you were running it from inside of your environment.
    2. When working with "secret" fields (passwords, keys, etc.), you can refer to any "self-managed" secrets by name, e.g. `${SECRET_NAME}` (a sketch follows after these steps):
- ![Using a secret called BQ_DEPLOY_KEY which is managed in AWS secrets manager](../imgs/saas/Screen-Shot-2023-01-19-at-4.16.52-PM.png)
+
+
+
+
+
3. In the 'Finish Up' step, click '**Advanced'**.
4. Update the '**Executor Id**' form field to be '**remote**'. This indicates that you'd like to use the remote executor.
5. Click '**Done**'.
Now, simply click '**Execute**' to test out the remote executor. If your remote executor is configured properly, you should promptly see the ingestion task state change to 'Running'.
-![](../imgs/saas/Screen-Shot-2022-03-07-at-10.23.31-AM.png)
+
+
+
+
+
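Continuing step 2 above, here is a hedged sketch of how a self-managed secret can be referenced by name inside a recipe; the source type, field names, and secret name are illustrative placeholders:

```python
# Hedged sketch: referencing a self-managed secret by name inside a recipe.
# The mysql source and MYSQL_PASSWORD secret name are placeholders for illustration.
recipe = {
    "source": {
        "type": "mysql",
        "config": {
            "host_port": "prod-mysql.internal:3306",  # placeholder host
            "username": "ingestion_user",             # placeholder user
            "password": "${MYSQL_PASSWORD}",          # resolved by the remote executor at runtime
        },
    },
    "sink": {"type": "datahub-rest", "config": {"server": "https://<your-instance>.acryl.io/gms"}},
}
```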
## Updating a Remote Ingestion Executor
In order to update the executor, i.e. to deploy a new container version, you'll need to update the CloudFormation Stack to re-deploy the CloudFormation template with a new set of parameters.
### Steps - AWS Console
@@ -66,7 +90,11 @@ In order to update the executor, ie. to deploy a new container version, you'll n
4. Select **Replace Current Template**
5. Select **Upload a template file**
6. Upload a copy of the Acryl Remote Executor [CloudFormation Template](https://raw.githubusercontent.com/acryldata/datahub-cloudformation/master/Ingestion/templates/python.ecs.template.yaml)
-![](../imgs/saas/Screen-Shot-2023-01-19-at-4.23.32-PM.png)
+
+
+
+
+
7. Click **Next**
8. Change parameters based on your modifications (e.g. ImageTag, etc)
9. Click **Next**
diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md
index 32951ab2e41ebd..98f70f6d933e40 100644
--- a/docs/modeling/extending-the-metadata-model.md
+++ b/docs/modeling/extending-the-metadata-model.md
@@ -11,7 +11,11 @@ these two concepts prior to making changes.
## To fork or not to fork?
An important question that will arise once you've decided to extend the metadata model is whether you need to fork the main repo or not. Use the diagram below to understand how to make this decision.
-![Metadata Model To Fork or Not](../imgs/metadata-model-to-fork-or-not-to.png)
+
+
+
+
+
The green lines represent pathways that will lead to lesser friction for you to maintain your code long term. The red lines represent higher risk of conflicts in the future. We are working hard to move the majority of model extension use-cases to no-code / low-code pathways to ensure that you can extend the core metadata model without having to maintain a custom fork of DataHub.
@@ -323,7 +327,7 @@ It takes the following parameters:
annotations. To customize the set of analyzers used to index a certain field, you must add a new field type and define
the set of mappings to be applied in the MappingsBuilder.
- Thus far, we have implemented 10 fieldTypes:
+ Thus far, we have implemented 11 fieldTypes:
1. *KEYWORD* - Short text fields that only support exact matches, often used only for filtering
@@ -332,20 +336,25 @@ It takes the following parameters:
3. *TEXT_PARTIAL* - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial
matching is expensive, so this field type should not be applied to fields with long values (like description)
- 4. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths.
+ 4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND
+ word gram support. That is, the text will be split by the delimiters and can be matched with delimited queries
+ matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is
+ expensive, so should not be applied to fields with long values such as description.
+
+ 5. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths.
- 5. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like
+ 6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like
"urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components
- 6. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support.
+ 7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support.
- 7. *BOOLEAN* - Boolean fields used for filtering.
+ 8. *BOOLEAN* - Boolean fields used for filtering.
- 8. *COUNT* - Count fields used for filtering.
+ 9. *COUNT* - Count fields used for filtering.
- 9. *DATETIME* - Datetime fields used to represent timestamps.
+ 10. *DATETIME* - Datetime fields used to represent timestamps.
- 10. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as
+ 11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as
`field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a
mapping explosion in Elasticsearch.
diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md
index 704fce14123294..037c9c7108a6e5 100644
--- a/docs/modeling/metadata-model.md
+++ b/docs/modeling/metadata-model.md
@@ -30,7 +30,11 @@ Conceptually, metadata is modeled using the following abstractions
Here is an example graph consisting of 3 types of entity (CorpUser, Chart, Dashboard), 2 types of relationship (OwnedBy, Contains), and 3 types of metadata aspect (Ownership, ChartInfo, and DashboardInfo).
-![metadata-modeling](../imgs/metadata-model-chart.png)
+
+
+
+
+
## The Core Entities
@@ -73,7 +77,11 @@ to the YAML configuration, instead of creating new Snapshot / Aspect files.
## Exploring DataHub's Metadata Model
To explore the current DataHub metadata model, you can inspect this high-level picture that shows the different entities and the edges between them, representing their relationships.
-![Metadata Model Graph](../imgs/datahub-metadata-model.png)
+
+
+
+
+
To navigate the aspect model for specific entities and explore relationships using the `foreign-key` concept, you can view them in our demo environment or navigate the auto-generated docs in the **Metadata Modeling/Entities** section on the left.
diff --git a/docs/platform-instances.md b/docs/platform-instances.md
index c6bfe3315de980..0f4515aedae549 100644
--- a/docs/platform-instances.md
+++ b/docs/platform-instances.md
@@ -1,44 +1,48 @@
-# Working With Platform Instances
-
-DataHub's metadata model for Datasets supports a three-part key currently:
-- Data Platform (e.g. urn:li:dataPlatform:mysql)
-- Name (e.g. db.schema.name)
-- Env or Fabric (e.g. DEV, PROD, etc.)
-
-This naming scheme unfortunately does not allow for easy representation of the multiplicity of platforms (or technologies) that might be deployed at an organization within the same environment or fabric. For example, an organization might have multiple Redshift instances in Production and would want to see all the data assets located in those instances inside the DataHub metadata repository.
-
-As part of the `v0.8.24+` releases, we are unlocking the first phase of supporting Platform Instances in the metadata model. This is done via two main additions:
-- The `dataPlatformInstance` aspect that has been added to Datasets which allows datasets to be associated to an instance of a platform
-- Enhancements to all ingestion sources that allow them to attach a platform instance to the recipe that changes the generated urns to go from `urn:li:dataset:(urn:li:dataPlatform:<platform>,<table_name>,ENV)` format to `urn:li:dataset:(urn:li:dataPlatform:<platform>,<platform_instance>.<table_name>,ENV)` format. Sources that produce lineage to datasets in other platforms (e.g. Looker, Superset etc) also have specific configuration additions that allow the recipe author to specify the mapping between a platform and the instance name that it should be mapped to.
-
-![./imgs/platform-instances-for-ingestion.png](./imgs/platform-instances-for-ingestion.png)
-
-## Naming Platform Instances
-
-When configuring a platform instance, choose an instance name that is understandable and will be stable for the foreseeable future. e.g. `core_warehouse` or `finance_redshift` are allowed names, as are pure guids like `a37dc708-c512-4fe4-9829-401cd60ed789`. Remember that whatever instance name you choose, you will need to specify it in more than one recipe to ensure that the identifiers produced by different sources will line up.
-
-## Enabling Platform Instances
-
-Read the Ingestion source specific guides for how to enable platform instances in each of them.
-The general pattern is to add an additional optional configuration parameter called `platform_instance`.
-
-e.g. here is how you would configure a recipe to ingest a mysql instance that you want to call `core_finance`
-```yaml
-source:
- type: mysql
- config:
- # Coordinates
- host_port: localhost:3306
- platform_instance: core_finance
- database: dbname
-
- # Credentials
- username: root
- password: example
-
-sink:
- # sink configs
-```
-
-
-##
+# Working With Platform Instances
+
+DataHub's metadata model for Datasets supports a three-part key currently:
+- Data Platform (e.g. urn:li:dataPlatform:mysql)
+- Name (e.g. db.schema.name)
+- Env or Fabric (e.g. DEV, PROD, etc.)
+
+This naming scheme unfortunately does not allow for easy representation of the multiplicity of platforms (or technologies) that might be deployed at an organization within the same environment or fabric. For example, an organization might have multiple Redshift instances in Production and would want to see all the data assets located in those instances inside the DataHub metadata repository.
+
+As part of the `v0.8.24+` releases, we are unlocking the first phase of supporting Platform Instances in the metadata model. This is done via two main additions:
+- The `dataPlatformInstance` aspect that has been added to Datasets which allows datasets to be associated to an instance of a platform
+- Enhancements to all ingestion sources that allow them to attach a platform instance to the recipe that changes the generated urns to go from `urn:li:dataset:(urn:li:dataPlatform:<platform>,<table_name>,ENV)` format to `urn:li:dataset:(urn:li:dataPlatform:<platform>,<platform_instance>.<table_name>,ENV)` format. Sources that produce lineage to datasets in other platforms (e.g. Looker, Superset etc) also have specific configuration additions that allow the recipe author to specify the mapping between a platform and the instance name that it should be mapped to.
+
+
+
+
+
+
+
+## Naming Platform Instances
+
+When configuring a platform instance, choose an instance name that is understandable and will be stable for the foreseeable future. e.g. `core_warehouse` or `finance_redshift` are allowed names, as are pure guids like `a37dc708-c512-4fe4-9829-401cd60ed789`. Remember that whatever instance name you choose, you will need to specify it in more than one recipe to ensure that the identifiers produced by different sources will line up.
+
+## Enabling Platform Instances
+
+Read the Ingestion source specific guides for how to enable platform instances in each of them.
+The general pattern is to add an additional optional configuration parameter called `platform_instance`.
+
+e.g. here is how you would configure a recipe to ingest a mysql instance that you want to call `core_finance`
+```yaml
+source:
+ type: mysql
+ config:
+ # Coordinates
+ host_port: localhost:3306
+ platform_instance: core_finance
+ database: dbname
+
+ # Credentials
+ username: root
+ password: example
+
+sink:
+ # sink configs
+```
+
+
+##
diff --git a/docs/schema-history.md b/docs/schema-history.md
index 9fc9ec1af52bbc..120d041960186e 100644
--- a/docs/schema-history.md
+++ b/docs/schema-history.md
@@ -23,20 +23,32 @@ must have the **View Entity Page** privilege, or be assigned to **any** DataHub
You can view the Schema History for a Dataset by navigating to that Dataset's Schema Tab. As long as that Dataset has more than
one version, you can view what a Dataset looked like at any given version by using the version selector.
Here's an example from DataHub's official Demo environment with the
-[Snowflake pets dataset](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.adoption.pets,PROD)/Schema?is_lineage_mode=false).
+Snowflake pets dataset.
+
+
+
+
+
-![](./imgs/schema-history-latest-version.png)
If you click on an older version in the selector, you'll be able to see what the schema looked like back then. Notice
the changes here to the glossary terms for the `status` field, and to the descriptions for the `created_at` and `updated_at`
fields.
-![](./imgs/schema-history-older-version.png)
+
+
+
+
+
In addition to this, you can also toggle the Audit view that shows you when the most recent changes were made to each field.
You can activate this by clicking on the Audit icon you see above the top right of the table.
-![](./imgs/schema-history-audit-activated.png)
+
+
+
+
+
You can see here that some of these fields were added at the oldest dataset version, while some were added only at this latest
version. Some fields were even modified and had a type change at the latest version!
diff --git a/docs/townhall-history.md b/docs/townhall-history.md
index 1da490ca6fa692..e235a70c5d7b95 100644
--- a/docs/townhall-history.md
+++ b/docs/townhall-history.md
@@ -343,8 +343,7 @@ Agenda
- Announcements - 2 mins
- Community Updates ([video](https://youtu.be/r862MZTLAJ0?t=99)) - 10 mins
-- Use-Case: DataHub at Viasat ([slides](demo/ViasatMetadataJourney.pdf),[video](https://youtu.be/2SrDAJnzkjE)) by [Anna Kepler](https://www.linkedin.com/in/akepler) - 15 mins
-- Tech Deep Dive: GraphQL + React RFCs readout and discussion ([slides](https://docs.google.com/presentation/d/e/2PACX-1vRtnINnpi6PvFw7-5iW8PSQoT9Kdf1O_0YW7QAr1_mSdJMNftYFTVCjKL-e3fpe8t6IGkha8UpdmoOI/pub?start=false&loop=false&delayms=3000) ,[video](https://www.youtube.com/watch?v=PrBaFrb7pqA)) by [John Joyce](https://www.linkedin.com/in/john-joyce-759883aa) and [Arun Vasudevan](https://www.linkedin.com/in/arun-vasudevan-55117368/) - 15 mins
+- Use-Case: DataHub at Viasat ([slides](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/ViasatMetadataJourney.pdf),[video](https://youtu.be/2SrDAJnzkjE)) by [Anna Kepler](https://www.linkedin.com/in/akepler) - 15 mins
+- Tech Deep Dive: GraphQL + React RFCs readout and discussion ([slides](https://docs.google.com/presentation/d/e/2PACX-1vRtnINnpi6PvFw7-5iW8PSQoT9Kdf1O_0YW7QAr1_mSdJMNftYFTVCjKL-e3fpe8t6IGkha8UpdmoOI/pub?start=false&loop=false&delayms=3000) ,[video](https://www.youtube.com/watch?v=PrBaFrb7pqA)) by [John Joyce](https://www.linkedin.com/in/john-joyce-759883aa) and [Arun Vasudevan](https://www.linkedin.com/in/arun-vasudevan-55117368/) - 15 mins
- General Q&A from sign up sheet, slack, and participants - 15 mins
- Closing remarks - 3 mins
- General Q&A from sign up sheet, slack, and participants - 15 mins
@@ -356,8 +355,8 @@ Agenda
Agenda
- Quick intro - 5 mins
-- [Why did Grofers choose DataHub for their data catalog?](demo/Datahub_at_Grofers.pdf) by [Shubham Gupta](https://www.linkedin.com/in/shubhamg931/) - 15 minutes
-- [DataHub UI development - Part 2](demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf) by [Charlie Tran](https://www.linkedin.com/in/charlie-tran/) (LinkedIn) - 20 minutes
+- [Why did Grofers choose DataHub for their data catalog?](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Datahub_at_Grofers.pdf) by [Shubham Gupta](https://www.linkedin.com/in/shubhamg931/) - 15 minutes
+- [DataHub UI development - Part 2](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Town_Hall_Presentation_-_12-2020_-_UI_Development_Part_2.pdf) by [Charlie Tran](https://www.linkedin.com/in/charlie-tran/) (LinkedIn) - 20 minutes
- General Q&A from sign up sheet, slack, and participants - 15 mins
- Closing remarks - 5 minutes
@@ -368,9 +367,9 @@ Agenda
Agenda
- Quick intro - 5 mins
-- [Lightning talk on Metadata use-cases at LinkedIn](demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf) by [Shirshanka Das](https://www.linkedin.com/in/shirshankadas/) (LinkedIn) - 5 mins
-- [Strongly Consistent Secondary Index (SCSI) in GMA](demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf), an upcoming feature by [Jyoti Wadhwani](https://www.linkedin.com/in/jyotiwadhwani/) (LinkedIn) - 15 minutes
-- [DataHub UI overview](demo/DataHub-UIOverview.pdf) by [Ignacio Bona](https://www.linkedin.com/in/ignaciobona) (LinkedIn) - 20 minutes
+- [Lightning talk on Metadata use-cases at LinkedIn](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf) by [Shirshanka Das](https://www.linkedin.com/in/shirshankadas/) (LinkedIn) - 5 mins
+- [Strongly Consistent Secondary Index (SCSI) in GMA](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Datahub_-_Strongly_Consistent_Secondary_Indexing.pdf), an upcoming feature by [Jyoti Wadhwani](https://www.linkedin.com/in/jyotiwadhwani/) (LinkedIn) - 15 minutes
+- [DataHub UI overview](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/DataHub-UIOverview.pdf) by [Ignacio Bona](https://www.linkedin.com/in/ignaciobona) (LinkedIn) - 20 minutes
- General Q&A from sign up sheet, slack, and participants - 10 mins
- Closing remarks - 5 minutes
@@ -382,8 +381,8 @@ Agenda
Agenda
- Quick intro - 5 mins
-- [Data Discoverability at SpotHero](demo/Data_Discoverability_at_SpotHero.pdf) by [Maggie Hays](https://www.linkedin.com/in/maggie-hays/) (SpotHero) - 20 mins
-- [Designing the next generation of metadata events for scale](demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf) by [Chris Lee](https://www.linkedin.com/in/chrisleecmu/) (LinkedIn) - 15 mins
+- [Data Discoverability at SpotHero](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Data_Discoverability_at_SpotHero.pdf) by [Maggie Hays](https://www.linkedin.com/in/maggie-hays/) (SpotHero) - 20 mins
+- [Designing the next generation of metadata events for scale](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Designing_the_next_generation_of_metadata_events_for_scale.pdf) by [Chris Lee](https://www.linkedin.com/in/chrisleecmu/) (LinkedIn) - 15 mins
- General Q&A from sign up sheet, slack, and participants - 15 mins
- Closing remarks - 5 mins
diff --git a/docs/ui-ingestion.md b/docs/ui-ingestion.md
index 4435f66e514f33..2ecb1e634c79f1 100644
--- a/docs/ui-ingestion.md
+++ b/docs/ui-ingestion.md
@@ -14,11 +14,19 @@ This document will describe the steps required to configure, schedule, and execu
To view & manage UI-based metadata ingestion, you must have the `Manage Metadata Ingestion` & `Manage Secrets`
privileges assigned to your account. These can be granted by a [Platform Policy](authorization/policies.md).
-![](./imgs/ingestion-privileges.png)
+
+
+
+
+
Once you have these privileges, you can begin to manage ingestion by navigating to the 'Ingestion' tab in DataHub.
-![](./imgs/ingestion-tab.png)
+
+
+
+
+
On this page, you'll see a list of active **Ingestion Sources**. An Ingestion Source is a unique source of metadata ingested
into DataHub from an external source like Snowflake, Redshift, or BigQuery.
@@ -33,7 +41,11 @@ your first **Ingestion Source**.
Before ingesting any metadata, you need to create a new Ingestion Source. Start by clicking **+ Create new source**.
-![](./imgs/create-new-ingestion-source-button.png)
+
+
+
+
+
#### Step 1: Select a Platform Template
@@ -41,7 +53,11 @@ In the first step, select a **Recipe Template** corresponding to the source type
a variety of natively supported integrations, from Snowflake to Postgres to Kafka.
Select `Custom` to construct an ingestion recipe from scratch.
-![](./imgs/select-platform-template.png)
+
+
+
+
+
Next, you'll configure an ingestion **Recipe**, which defines _how_ and _what_ to extract from the source system.
@@ -68,7 +84,11 @@ used by DataHub to extract metadata from a 3rd party system. It most often consi
A sample of a full recipe configured to ingest metadata from MySQL can be found in the image below.
-![](./imgs/example-mysql-recipe.png)
+
+
+
+
+
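Because the sample recipe is presented as an image, here is a hedged, minimal sketch of the same shape expressed as a Python dictionary; all connection values, the server address, and the token are placeholders:

```python
# Hedged, minimal MySQL recipe sketch; every value below is a placeholder.
recipe = {
    "source": {
        "type": "mysql",
        "config": {
            "host_port": "localhost:3306",
            "database": "dbname",
            "username": "root",
            "password": "example",
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080", "token": "<personal-access-token>"},
    },
}
```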
Detailed configuration examples & documentation for each source type can be found on the [DataHub Docs](https://datahubproject.io/docs/metadata-ingestion/) website.
@@ -80,7 +100,11 @@ that are encrypted and stored within DataHub's storage layer.
To create a secret, first navigate to the 'Secrets' tab. Then click `+ Create new secret`.
-![](./imgs/create-secret.png)
+
+
+
+
+
_Creating a Secret to store the username for a MySQL database_
@@ -123,7 +147,11 @@ Secret values are not persisted to disk beyond execution time, and are never tra
Next, you can optionally configure a schedule on which to execute your new Ingestion Source. This enables you to schedule metadata extraction on a monthly, weekly, daily, or hourly cadence depending on the needs of your organization.
Schedules are defined using CRON format.
-![](./imgs/schedule-ingestion.png)
+
+
+
+
+
_An Ingestion Source that is executed at 9:15am every day, Los Angeles time_
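For reference, the 9:15am-daily schedule in the caption above corresponds to the CRON expression `15 9 * * *`. A hedged sketch that sanity-checks such an expression with the third-party `croniter` package (an illustrative choice on my part, not something the DataHub UI requires):

```python
from datetime import datetime
from croniter import croniter  # third-party helper used here purely for illustration

expr = "15 9 * * *"  # minute=15, hour=9, every day
assert croniter.is_valid(expr)
next_run = croniter(expr, datetime(2023, 1, 1)).get_next(datetime)
print(next_run)  # 2023-01-01 09:15:00
```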
@@ -136,7 +164,11 @@ you can always come back and change this.
Finally, give your Ingestion Source a name.
-![](./imgs/name-ingestion-source.png)
+
+
+
+
+
Once you're happy with your configurations, click 'Done' to save your changes.
@@ -149,7 +181,11 @@ with the server. However, you can override the default package version using the
To do so, simply click 'Advanced', then change the 'CLI Version' text box to contain the exact version
of the DataHub CLI you'd like to use.
-![](./imgs/custom-ingestion-cli-version.png)
+
+
+
+
+
_Pinning the CLI version to version `0.8.23.2`_
Once you're happy with your changes, simply click 'Done' to save.
@@ -200,11 +236,19 @@ Once you've created your Ingestion Source, you can run it by clicking 'Execute'.
you should see the 'Last Status' column of the ingestion source change from `N/A` to `Running`. This
means that the request to execute ingestion has been successfully picked up by the DataHub ingestion executor.
-![](./imgs/running-ingestion.png)
+
+
+
+
+
If ingestion has executed successfully, you should see its state shown in green as `Succeeded`.
-![](./imgs/successful-ingestion.png)
+
+
+
+
+
### Cancelling an Ingestion Run
@@ -212,14 +256,22 @@ If ingestion has executed successfully, you should see it's state shown in green
If your ingestion run is hanging, there may be a bug in the ingestion source, or another persistent issue like exponential timeouts. In these situations,
you can cancel ingestion by clicking **Cancel** on the problematic run.
-![](./imgs/cancelled-ingestion.png)
+
+
+
+
+
Once cancelled, you can view the output of the ingestion run by clicking **Details**.
### Debugging a Failed Ingestion Run
-![](./imgs/failed-ingestion.png)
+
+
+
+
+
A variety of things can cause an ingestion run to fail. Common reasons for failure include:
@@ -235,12 +287,20 @@ A variety of things can cause an ingestion run to fail. Common reasons for failu
4. **Authentication**: If you've enabled [Metadata Service Authentication](authentication/introducing-metadata-service-authentication.md), you'll need to provide a Personal Access Token
   in your Recipe Configuration. To do this, set the 'token' field of the sink configuration to contain a Personal Access Token:
- ![](./imgs/ingestion-with-token.png)
+
+
+
+
+
The output of each run is captured and available to view in the UI for easier debugging. To view output logs, click **DETAILS**
on the corresponding ingestion run.
-![](./imgs/ingestion-logs.png)
+
+
+
+
+
## FAQ
@@ -250,7 +310,11 @@ If not due to one of the reasons outlined above, this may be because the executo
to reach DataHub's backend using the default configurations. Try changing your ingestion recipe to make the `sink.config.server` variable point to the Docker
DNS name for the `datahub-gms` pod:
-![](./imgs/quickstart-ingestion-config.png)
+
+
+
+
+
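A hedged sketch of that override; the `datahub-gms` host name and port `8080` are the usual quickstart defaults, but treat them as assumptions and adjust to your deployment:

```python
# Hedged sketch: point the sink at the datahub-gms container's Docker DNS name
# instead of localhost. Host name and port are quickstart defaults, not guaranteed.
sink = {
    "type": "datahub-rest",
    "config": {"server": "http://datahub-gms:8080"},
}
```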
### I see 'N/A' when I try to run ingestion. What do I do?
diff --git a/docs/what/relationship.md b/docs/what/relationship.md
index 1908bbd6ce75f0..dcfe093a1b1245 100644
--- a/docs/what/relationship.md
+++ b/docs/what/relationship.md
@@ -2,7 +2,11 @@
A relationship is a named association between exactly two [entities](entity.md), a source and a destination.
-![metadata-modeling](../imgs/metadata-modeling.png)
+
+
+
+
+
From the above graph, a `Group` entity can be linked to a `User` entity via a `HasMember` relationship.
Note that the name of the relationship reflects the direction, i.e. pointing from `Group` to `User`.
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java b/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java
index 2ffd9283ed4569..8f2f42cd69caee 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java
@@ -155,7 +155,8 @@ private void extractSearchableAnnotation(final Object annotationObj, final DataS
annotation.getBoostScore(),
annotation.getHasValuesFieldName(),
annotation.getNumValuesFieldName(),
- annotation.getWeightsPerFieldValue());
+ annotation.getWeightsPerFieldValue(),
+ annotation.getFieldNameAliases());
}
}
log.debug("Searchable annotation for field: {} : {}", schemaPathSpec, annotation);
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java
index f2e65c771c6eb2..d5e5044f95c238 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java
@@ -4,7 +4,10 @@
import com.google.common.collect.ImmutableSet;
import com.linkedin.data.schema.DataSchema;
import com.linkedin.metadata.models.ModelValidationException;
+
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
@@ -19,9 +22,10 @@
@Value
public class SearchableAnnotation {
+ public static final String FIELD_NAME_ALIASES = "fieldNameAliases";
public static final String ANNOTATION_NAME = "Searchable";
  private static final Set<FieldType> DEFAULT_QUERY_FIELD_TYPES =
- ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.URN, FieldType.URN_PARTIAL);
+ ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL);
// Name of the field in the search index. Defaults to the field name in the schema
String fieldName;
@@ -47,6 +51,8 @@ public class SearchableAnnotation {
  Optional<String> numValuesFieldName;
// (Optional) Weights to apply to score for a given value
  Map<Object, Double> weightsPerFieldValue;
+ // (Optional) Aliases for this given field that can be used for sorting etc.
+  List<String> fieldNameAliases;
public enum FieldType {
KEYWORD,
@@ -59,7 +65,8 @@ public enum FieldType {
COUNT,
DATETIME,
OBJECT,
- BROWSE_PATH_V2
+ BROWSE_PATH_V2,
+ WORD_GRAM
}
@Nonnull
@@ -93,6 +100,7 @@ public static SearchableAnnotation fromPegasusAnnotationObject(@Nonnull final Ob
    final Optional<String> numValuesFieldName = AnnotationUtils.getField(map, "numValuesFieldName", String.class);
    final Optional<Map<Object, Double>> weightsPerFieldValueMap =
        AnnotationUtils.getField(map, "weightsPerFieldValue", Map.class).map(m -> (Map<Object, Double>) m);
+    final List<String> fieldNameAliases = getFieldNameAliases(map);
final FieldType resolvedFieldType = getFieldType(fieldType, schemaDataType);
return new SearchableAnnotation(
@@ -107,7 +115,8 @@ public static SearchableAnnotation fromPegasusAnnotationObject(@Nonnull final Ob
boostScore.orElse(1.0),
hasValuesFieldName,
numValuesFieldName,
- weightsPerFieldValueMap.orElse(ImmutableMap.of()));
+ weightsPerFieldValueMap.orElse(ImmutableMap.of()),
+ fieldNameAliases);
}
  private static FieldType getFieldType(Optional<String> maybeFieldType, DataSchema.Type schemaDataType) {
@@ -155,4 +164,15 @@ private static String capitalizeFirstLetter(String str) {
return str.substring(0, 1).toUpperCase() + str.substring(1);
}
}
+
+  private static List<String> getFieldNameAliases(Map map) {
+    final List<String> aliases = new ArrayList<>();
+    final Optional<List> fieldNameAliases = AnnotationUtils.getField(map, FIELD_NAME_ALIASES, List.class);
+ if (fieldNameAliases.isPresent()) {
+ for (Object alias : fieldNameAliases.get()) {
+ aliases.add((String) alias);
+ }
+ }
+ return aliases;
+ }
}
diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java
index 1ab5ff640ce327..3618108970afa6 100644
--- a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java
+++ b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java
@@ -142,7 +142,7 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) {
assertEquals(new TestEntityInfo().schema().getFullName(), testEntityInfo.getPegasusSchema().getFullName());
// Assert on Searchable Fields
- assertEquals(9, testEntityInfo.getSearchableFieldSpecs().size());
+ assertEquals(testEntityInfo.getSearchableFieldSpecs().size(), 10);
assertEquals("customProperties", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("customProperties").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.KEYWORD, testEntityInfo.getSearchableFieldSpecMap().get(
@@ -158,6 +158,11 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) {
assertEquals(SearchableAnnotation.FieldType.TEXT_PARTIAL, testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("textArrayField", "*").toString())
.getSearchableAnnotation().getFieldType());
+ assertEquals("wordGramField", testEntityInfo.getSearchableFieldSpecMap().get(
+ new PathSpec("wordGramField").toString()).getSearchableAnnotation().getFieldName());
+ assertEquals(SearchableAnnotation.FieldType.WORD_GRAM, testEntityInfo.getSearchableFieldSpecMap().get(
+ new PathSpec("wordGramField").toString())
+ .getSearchableAnnotation().getFieldType());
assertEquals("nestedIntegerField", testEntityInfo.getSearchableFieldSpecMap().get(
new PathSpec("nestedRecordField", "nestedIntegerField").toString()).getSearchableAnnotation().getFieldName());
assertEquals(SearchableAnnotation.FieldType.COUNT, testEntityInfo.getSearchableFieldSpecMap().get(
diff --git a/metadata-ingestion/adding-source.md b/metadata-ingestion/adding-source.md
index 50e6a1cd5fcc6a..e4fc950a7cdbd0 100644
--- a/metadata-ingestion/adding-source.md
+++ b/metadata-ingestion/adding-source.md
@@ -44,7 +44,11 @@ class LookerAPIConfig(ConfigModel):
```
generates the following documentation:
-![Generated Config Documentation](./docs/images/generated_config_docs.png)
+
+
+
+
+
:::note
Inline markdown or code snippets are not yet supported for field level documentation.
diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle
index f636cf25c67f72..199ccc59c21e04 100644
--- a/metadata-ingestion/build.gradle
+++ b/metadata-ingestion/build.gradle
@@ -21,11 +21,13 @@ task checkPythonVersion(type: Exec) {
}
task environmentSetup(type: Exec, dependsOn: checkPythonVersion) {
+ def sentinel_file = "${venv_name}/.venv_environment_sentinel"
inputs.file file('setup.py')
- outputs.dir("${venv_name}")
+ outputs.file(sentinel_file)
commandLine 'bash', '-c',
"${python_executable} -m venv ${venv_name} && " +
- "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0'"
+ "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0' && " +
+ "touch ${sentinel_file}"
}
task runPreFlightScript(type: Exec, dependsOn: environmentSetup) {
@@ -39,7 +41,6 @@ task runPreFlightScript(type: Exec, dependsOn: environmentSetup) {
task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) {
def sentinel_file = "${venv_name}/.build_install_package_only_sentinel"
inputs.file file('setup.py')
- outputs.dir("${venv_name}")
outputs.file(sentinel_file)
commandLine 'bash', '-x', '-c',
"${venv_name}/bin/pip install -e . &&" +
@@ -47,9 +48,12 @@ task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) {
}
task installPackage(type: Exec, dependsOn: installPackageOnly) {
+ def sentinel_file = "${venv_name}/.build_install_package_sentinel"
inputs.file file('setup.py')
- outputs.dir("${venv_name}")
- commandLine 'bash', '-x', '-c', "${venv_name}/bin/pip install -e . ${extra_pip_requirements}"
+ outputs.file(sentinel_file)
+ commandLine 'bash', '-x', '-c',
+ "${venv_name}/bin/pip install -e . ${extra_pip_requirements} && " +
+ "touch ${sentinel_file}"
}
task codegen(type: Exec, dependsOn: [environmentSetup, installPackage, ':metadata-events:mxe-schemas:build']) {
@@ -63,7 +67,6 @@ task install(dependsOn: [installPackage, codegen])
task installDev(type: Exec, dependsOn: [install]) {
def sentinel_file = "${venv_name}/.build_install_dev_sentinel"
inputs.file file('setup.py')
- outputs.dir("${venv_name}")
outputs.file(sentinel_file)
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
@@ -75,7 +78,6 @@ task installDev(type: Exec, dependsOn: [install]) {
task installAll(type: Exec, dependsOn: [install]) {
def sentinel_file = "${venv_name}/.build_install_all_sentinel"
inputs.file file('setup.py')
- outputs.dir("${venv_name}")
outputs.file(sentinel_file)
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
diff --git a/metadata-ingestion/developing.md b/metadata-ingestion/developing.md
index 67041d23a21b13..5d49b9a866a3d9 100644
--- a/metadata-ingestion/developing.md
+++ b/metadata-ingestion/developing.md
@@ -74,7 +74,9 @@ The syntax for installing plugins is slightly different in development. For exam
## Architecture
-![metadata ingestion framework layout](../docs/imgs/datahub-metadata-ingestion-framework.png)
+
+
+
The architecture of this metadata ingestion framework is heavily inspired by [Apache Gobblin](https://gobblin.apache.org/) (also originally a LinkedIn project!). We have a standardized format - the MetadataChangeEvent - and sources and sinks which respectively produce and consume these objects. The sources pull metadata from a variety of data systems, while the sinks are primarily for moving this metadata into DataHub.
diff --git a/metadata-ingestion/docs/dev_guides/stateful.md b/metadata-ingestion/docs/dev_guides/stateful.md
index eccacbb416714b..b3a409e965c629 100644
--- a/metadata-ingestion/docs/dev_guides/stateful.md
+++ b/metadata-ingestion/docs/dev_guides/stateful.md
@@ -38,7 +38,9 @@ Following is the list of current use-cases powered by stateful ingestion in data
Stateful ingestion can be used to automatically soft-delete the tables and views that are seen in a previous run
but absent in the current run (they are either deleted or no longer desired).
-![Stale Metadata Deletion](./stale_metadata_deletion.png)
+
+
+
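+A minimal recipe sketch enabling this behaviour (the source type and connection details below are placeholders; the relevant flags are `stateful_ingestion.enabled` and `stateful_ingestion.remove_stale_metadata`, and a stable `pipeline_name` is required):
+
+```yml
+pipeline_name: my_mysql_pipeline  # illustrative; must stay stable across runs
+source:
+  type: mysql
+  config:
+    # ... connection details ...
+    stateful_ingestion:
+      enabled: true
+      remove_stale_metadata: true
+```
+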
#### Supported sources
* All sql based sources.
diff --git a/metadata-ingestion/docs/sources/azure-ad/azure-ad.md b/metadata-ingestion/docs/sources/azure-ad/azure-ad.md
index 8b375fbee4f33c..d2677d7e4fc7a3 100644
--- a/metadata-ingestion/docs/sources/azure-ad/azure-ad.md
+++ b/metadata-ingestion/docs/sources/azure-ad/azure-ad.md
@@ -5,6 +5,15 @@ to read your organization's Users and Groups. The following permissions are requ
- `GroupMember.Read.All`
- `User.Read.All`
-You can add a permission by navigating to the permissions tab in your DataHub application on the Azure AD portal. ![Azure AD API Permissions](./azure_ad_api_permissions.png)
+You can add a permission by navigating to the permissions tab in your DataHub application on the Azure AD portal.
+
+
+
-You can view the necessary endpoints to configure by clicking on the Endpoints button in the Overview tab. ![Azure AD Endpoints](./azure_ad_endpoints.png)
+
+You can view the necessary endpoints to configure by clicking on the Endpoints button in the Overview tab.
+
+
+
+
+
diff --git a/metadata-ingestion/docs/sources/databricks/README.md b/metadata-ingestion/docs/sources/databricks/README.md
index 01aee3236e01c2..b380a892c22b9d 100644
--- a/metadata-ingestion/docs/sources/databricks/README.md
+++ b/metadata-ingestion/docs/sources/databricks/README.md
@@ -15,8 +15,11 @@ To complete the picture, we recommend adding push-based ingestion from your Spar
## Watch the DataHub Talk at the Data and AI Summit 2022
For a deeper look at how to think about DataHub within and across your Databricks ecosystem, watch the recording of our talk at the Data and AI Summit 2022.
-
-[![IMAGE_ALT](../../images/databricks/data_and_ai_summit_2022.png)](https://www.youtube.com/watch?v=SCP0PR3t7dc)
+
+
+
+
+
diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md
index 9d400460407c8c..03bcef70e18604 100644
--- a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md
+++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md
@@ -1,5 +1,60 @@
## Advanced Configurations
+### Working with Platform Instances
+If you have multiple instances of Kafka or of the source/sink systems referenced in your `kafka-connect` setup, you need to configure platform instances for these systems in the `kafka-connect` recipe to generate correct lineage edges. You must have already set `platform_instance` in the recipes of the original source/sink systems. Refer to the document [Working with Platform Instances](https://datahubproject.io/docs/platform-instances) to learn more.
+
+There are two options for declaring a source/sink system's `platform_instance` in the `kafka-connect` recipe. If a single instance of a platform is used across all `kafka-connect` connectors, you can use `platform_instance_map` to specify the platform_instance to use for that platform when constructing URNs for lineage.
+
+Example:
+```yml
+ # Map of platform name to platform instance
+ platform_instance_map:
+ snowflake: snowflake_platform_instance
+ mysql: mysql_platform_instance
+
+```
+If multiple instances of a platform are used across `kafka-connect` connectors, you need to specify the platform_instance to use for that platform for every connector.
+
+#### Example - Multiple MySQL Source Connectors, each reading from a different MySQL instance
+```yml
+ # Map of platform name to platform instance per connector
+ connect_to_platform_map:
+ mysql_connector1:
+ mysql: mysql_instance1
+
+ mysql_connector2:
+ mysql: mysql_instance2
+```
+Here `mysql_connector1` and `mysql_connector2` are the names of the MySQL source connectors as defined in the `kafka-connect` connector config.
+
+#### Example - Multiple MySQL Source Connectors, each reading from a different MySQL instance and writing to a different Kafka cluster
+```yml
+ connect_to_platform_map:
+ mysql_connector1:
+ mysql: mysql_instance1
+ kafka: kafka_instance1
+
+ mysql_connector2:
+ mysql: mysql_instance2
+ kafka: kafka_instance2
+```
+You can also use a combination of `platform_instance_map` and `connect_to_platform_map` in your recipe, as sketched in the example below. Note that the platform_instance specified for a connector in `connect_to_platform_map` always takes precedence, even if a platform_instance for the same platform is set in `platform_instance_map`.
+
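+A sketch combining both options (the instance names are illustrative and follow the examples in this section):
+```yml
+  # Default platform instances, used when a connector has no explicit mapping
+  platform_instance_map:
+    kafka: kafka_instance1
+    mysql: mysql_instance1
+
+  # Connector-specific mappings take precedence over platform_instance_map
+  connect_to_platform_map:
+    mysql_connector2:
+      mysql: mysql_instance2
+      kafka: kafka_instance2
+```
+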
+If you do not use `platform_instance` in the original source/sink recipes, you do not need to specify it in the configurations above.
+
+Note that you do not need to specify a platform_instance for BigQuery.
+
+#### Example - Multiple BigQuery Sink Connectors, each writing to a different Kafka cluster
+```yml
+ connect_to_platform_map:
+ bigquery_connector1:
+ kafka: kafka_instance1
+
+ bigquery_connector2:
+ kafka: kafka_instance2
+```
+
+### Provided Configurations from External Sources
Kafka Connect supports pluggable configuration providers which can load configuration data from external sources at runtime. These values are not available to the DataHub ingestion source through the Kafka Connect APIs. If you use such provided configurations to specify connection URLs (database, etc.) in a Kafka Connect connector configuration, you also need to add them to the `provided_configs` section of the recipe for DataHub to generate correct lineage.
```yml
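# Illustrative sketch only: entries follow the provided_configs structure
# (provider, path_key, value); the values below are placeholders.
provided_configs:
  - provider: env
    path_key: MYSQL_CONNECTION_URL
    value: jdbc:mysql://localhost:5432/librarydb
```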
diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml
index f5e33e661622d8..cacbda5ca078a3 100644
--- a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml
+++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml
@@ -3,14 +3,16 @@ source:
config:
# Coordinates
connect_uri: "http://localhost:8083"
-
+
# Credentials
username: admin
password: password
# Optional
- platform_instance_map:
- bigquery: bigquery_platform_instance_id
-
+ # Platform instance mapping to use when constructing URNs.
+    # Use this if a single instance of the platform is referenced across all connectors.
+ platform_instance_map:
+ mysql: mysql_platform_instance
+
sink:
- # sink configs
\ No newline at end of file
+ # sink configs
diff --git a/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png b/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png
deleted file mode 100644
index 7227dc04fb8a0a..00000000000000
Binary files a/metadata-ingestion/docs/sources/looker/looker_datahub_permission_set.png and /dev/null differ
diff --git a/metadata-ingestion/docs/sources/looker/looker_pre.md b/metadata-ingestion/docs/sources/looker/looker_pre.md
index ad7fff9c0daafe..6798103d66e994 100644
--- a/metadata-ingestion/docs/sources/looker/looker_pre.md
+++ b/metadata-ingestion/docs/sources/looker/looker_pre.md
@@ -19,7 +19,10 @@ see_user_dashboards
see_users
```
Here is an example permission set after configuration.
-![Looker DataHub Permission Set](./looker_datahub_permission_set.png)
+
+
+
+
#### Get an API key
diff --git a/metadata-ingestion/docs/sources/mssql/mssql_pre.md b/metadata-ingestion/docs/sources/mssql/mssql_pre.md
new file mode 100644
index 00000000000000..396581966e691b
--- /dev/null
+++ b/metadata-ingestion/docs/sources/mssql/mssql_pre.md
@@ -0,0 +1,14 @@
+### Prerequisites
+
+If you want to ingest MSSQL Jobs and stored procedures (with code), the user credentials need the proper privileges.
+
+Script for granting the privileges:
+```sql
+USE MSDB
+GRANT SELECT ON OBJECT::msdb.dbo.sysjobsteps TO USERNAME
+GRANT SELECT ON OBJECT::msdb.dbo.sysjobs TO USERNAME
+
+USE DATA_DB_NAME
+GRANT VIEW DEFINITION TO USERNAME
+GRANT SELECT ON OBJECT::sys.sql_expression_dependencies TO USERNAME
+```
\ No newline at end of file
diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md
index 9a381fb351aecd..75bd579417a48f 100644
--- a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md
+++ b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md
@@ -99,6 +99,24 @@ The steps slightly differ based on which you decide to use.
including `client_id` and `client_secret`, plus your Okta user's `Username` and `Password`
* Note: the `username` and `password` config options are not nested under `oauth_config`
+### Snowflake Shares
+If you are using [Snowflake Shares](https://docs.snowflake.com/en/user-guide/data-sharing-provider) to share data across different Snowflake accounts, and you have set up DataHub recipes for ingesting metadata from all these accounts, you may end up with multiple similar dataset entities corresponding to virtual versions of the same table in different Snowflake accounts. The DataHub Snowflake connector can automatically link such tables together through Siblings and Lineage relationships if you provide the information necessary to establish the relationship via the `shares` configuration in the recipe.
+
+#### Example
+- Snowflake account `account1` (ingested as platform_instance `instance1`) owns a database `db1`. A share `X` is created in `account1` that includes database `db1` along with schemas and tables inside it.
+- Now, `X` is shared with snowflake account `account2` (ingested as platform_instance `instance2`). A database `db1_from_X` is created from inbound share `X` in `account2`. In this case, all tables and views included in share `X` will also be present in `instance2`.`db1_from_X`.
+- This can be represented in `shares` configuration section as
+ ```yaml
+ shares:
+ X: # name of the share
+ database_name: db1
+ platform_instance: instance1
+ consumers: # list of all databases created from share X
+ - database_name: db1_from_X
+ platform_instance: instance2
+
+ ```
+- If share `X` is shared with more Snowflake accounts and a database is created from share `X` in those accounts, then additional entries need to be added to the `consumers` list for share `X`, one per Snowflake account, as sketched below. The same `shares` config can then be copied across the recipes of all accounts.
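+- For example, if share `X` were also consumed by a third Snowflake account ingested as platform_instance `instance3` (an illustrative name; the database created from the share in each consuming account can have any name), the `consumers` list simply gains one more entry:
+  ```yaml
+  shares:
+    X:
+      database_name: db1
+      platform_instance: instance1
+      consumers:
+        - database_name: db1_from_X
+          platform_instance: instance2
+        - database_name: db1_from_X
+          platform_instance: instance3
+  ```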
### Caveats
- Some of the features are only available in the Snowflake Enterprise Edition. This doc has notes mentioning where this applies.
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 4ff1d06bb8c22e..ded9186e08a228 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -376,6 +376,7 @@ def get_long_description():
"salesforce": {"simple-salesforce"},
"snowflake": snowflake_common | usage_common | sqlglot_lib,
"sqlalchemy": sql_common,
+ "sql-queries": usage_common | sqlglot_lib,
"superset": {
"requests",
"sqlalchemy",
@@ -388,7 +389,7 @@ def get_long_description():
"trino": sql_common | trino,
"starburst-trino-usage": sql_common | usage_common | trino,
"nifi": {"requests", "packaging", "requests-gssapi"},
- "powerbi": microsoft_common | {"lark[regex]==1.1.4", "sqlparse"},
+ "powerbi": microsoft_common | {"lark[regex]==1.1.4", "sqlparse"} | sqlglot_lib,
"powerbi-report-server": powerbi_report_server,
"vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8"},
"unity-catalog": databricks | sqllineage_lib,
@@ -454,7 +455,7 @@ def get_long_description():
"mypy==1.0.0",
# pydantic 1.8.2 is incompatible with mypy 0.910.
# See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
- "pydantic>=1.9.0",
+ "pydantic>=1.10.0",
*test_api_requirements,
pytest_dep,
"pytest-asyncio>=0.16.0",
@@ -608,6 +609,7 @@ def get_long_description():
"demo-data = datahub.ingestion.source.demo_data.DemoDataSource",
"unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource",
"gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource",
+ "sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource",
],
"datahub.ingestion.transformer.plugins": [
"simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership",
diff --git a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py
new file mode 100644
index 00000000000000..071d590f270f8b
--- /dev/null
+++ b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py
@@ -0,0 +1,289 @@
+import logging
+import time
+from collections import defaultdict
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Collection, Dict, Iterable, List, Optional, Set
+
+from datahub.emitter.mce_builder import make_schema_field_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.usage.usage_common import BaseUsageConfig, UsageAggregator
+from datahub.metadata.schema_classes import (
+ AuditStampClass,
+ DatasetLineageTypeClass,
+ FineGrainedLineageClass,
+ FineGrainedLineageDownstreamTypeClass,
+ FineGrainedLineageUpstreamTypeClass,
+ OperationClass,
+ OperationTypeClass,
+ UpstreamClass,
+ UpstreamLineageClass,
+)
+from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
+
+logger = logging.getLogger(__name__)
+
+# TODO: Use this over other sources' equivalent code, if possible
+
+DatasetUrn = str
+FieldUrn = str
+UserUrn = str
+
+
+@dataclass
+class LineageEdge:
+ """Stores information about a single lineage edge, from an upstream table to a downstream table."""
+
+ downstream_urn: DatasetUrn
+ upstream_urn: DatasetUrn
+ audit_stamp: Optional[datetime]
+ actor: Optional[UserUrn]
+ type: str = DatasetLineageTypeClass.TRANSFORMED
+
+ # Maps downstream_col -> {upstream_col}
+ column_map: Dict[str, Set[str]] = field(default_factory=lambda: defaultdict(set))
+
+ def gen_upstream_aspect(self) -> UpstreamClass:
+ return UpstreamClass(
+ auditStamp=AuditStampClass(
+ time=int(self.audit_stamp.timestamp() * 1000), actor=self.actor or ""
+ )
+ if self.audit_stamp
+ else None,
+ dataset=self.upstream_urn,
+ type=self.type,
+ )
+
+ def gen_fine_grained_lineage_aspects(self) -> Iterable[FineGrainedLineageClass]:
+ for downstream_col, upstream_cols in self.column_map.items():
+ yield FineGrainedLineageClass(
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+ # Sort to avoid creating multiple aspects in backend with same lineage but different order
+ upstreams=sorted(
+ make_schema_field_urn(self.upstream_urn, col)
+ for col in upstream_cols
+ ),
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+ downstreams=[
+ make_schema_field_urn(self.downstream_urn, downstream_col)
+ ],
+ )
+
+
+@dataclass
+class SqlParsingBuilder:
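+    """Builds lineage, usage, and operation workunits from parsed SQL queries.
+
+    Illustrative usage sketch (the iteration and the `parsed_queries` iterable are the caller's; names are assumptions):
+
+        builder = SqlParsingBuilder(usage_config=BaseUsageConfig())
+        for result, query, timestamp in parsed_queries:
+            yield from builder.process_sql_parsing_result(
+                result, query=query, query_timestamp=timestamp
+            )
+        yield from builder.gen_workunits()
+    """
+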
+ # Open question: does it make sense to iterate over out_tables? When will we have multiple?
+
+ generate_lineage: bool = True
+ generate_usage_statistics: bool = True
+ generate_operations: bool = True
+ usage_config: Optional[BaseUsageConfig] = None
+
+ # TODO: Make inner dict a FileBackedDict and make LineageEdge frozen
+ # Builds up a single LineageEdge for each upstream -> downstream pair
+ _lineage_map: Dict[DatasetUrn, Dict[DatasetUrn, LineageEdge]] = field(
+ default_factory=lambda: defaultdict(dict), init=False
+ )
+
+ # TODO: Replace with FileBackedDict approach like in BigQuery usage
+ _usage_aggregator: UsageAggregator[DatasetUrn] = field(init=False)
+
+ def __post_init__(self) -> None:
+ if self.usage_config:
+ self._usage_aggregator = UsageAggregator(self.usage_config)
+ else:
+ logger.info("No usage config provided, not generating usage statistics")
+ self.generate_usage_statistics = False
+
+ def process_sql_parsing_result(
+ self,
+ result: SqlParsingResult,
+ *,
+ query: str,
+ query_timestamp: Optional[datetime] = None,
+ is_view_ddl: bool = False,
+ user: Optional[UserUrn] = None,
+ custom_operation_type: Optional[str] = None,
+ include_urns: Optional[Set[DatasetUrn]] = None,
+ ) -> Iterable[MetadataWorkUnit]:
+ """Process a single query and yield any generated workunits.
+
+ Args:
+ result: The result of parsing the query, or a mock result if parsing failed.
+ query: The SQL query to parse and process.
+ query_timestamp: When the query was run.
+ is_view_ddl: Whether the query is a DDL statement that creates a view.
+ user: The urn of the user who ran the query.
+ custom_operation_type: Platform-specific operation type, used if the operation type can't be parsed.
+ include_urns: If provided, only generate workunits for these urns.
+ """
+ downstreams_to_ingest = result.out_tables
+ upstreams_to_ingest = result.in_tables
+ if include_urns:
+ logger.debug(f"Skipping urns {set(downstreams_to_ingest) - include_urns}")
+ downstreams_to_ingest = list(set(downstreams_to_ingest) & include_urns)
+ upstreams_to_ingest = list(set(upstreams_to_ingest) & include_urns)
+
+ if self.generate_lineage:
+ for downstream_urn in downstreams_to_ingest:
+ _merge_lineage_data(
+ downstream_urn=downstream_urn,
+ upstream_urns=result.in_tables,
+ column_lineage=result.column_lineage,
+ upstream_edges=self._lineage_map[downstream_urn],
+ query_timestamp=query_timestamp,
+ is_view_ddl=is_view_ddl,
+ user=user,
+ )
+
+ if self.generate_usage_statistics and query_timestamp is not None:
+ upstream_fields = _compute_upstream_fields(result)
+ for upstream_urn in upstreams_to_ingest:
+ self._usage_aggregator.aggregate_event(
+ resource=upstream_urn,
+ start_time=query_timestamp,
+ query=query,
+ user=user,
+ fields=sorted(upstream_fields.get(upstream_urn, [])),
+ )
+
+ if self.generate_operations and query_timestamp is not None:
+ for downstream_urn in downstreams_to_ingest:
+ yield from _gen_operation_workunit(
+ result,
+ downstream_urn=downstream_urn,
+ query_timestamp=query_timestamp,
+ user=user,
+ custom_operation_type=custom_operation_type,
+ )
+
+ def add_lineage(
+ self,
+ downstream_urn: DatasetUrn,
+ upstream_urns: Collection[DatasetUrn],
+ timestamp: Optional[datetime] = None,
+ is_view_ddl: bool = False,
+ user: Optional[UserUrn] = None,
+ ) -> None:
+ """Manually add a single upstream -> downstream lineage edge, e.g. if sql parsing fails."""
+ _merge_lineage_data(
+ downstream_urn=downstream_urn,
+ upstream_urns=upstream_urns,
+ column_lineage=None,
+ upstream_edges=self._lineage_map[downstream_urn],
+ query_timestamp=timestamp,
+ is_view_ddl=is_view_ddl,
+ user=user,
+ )
+
+ def gen_workunits(self) -> Iterable[MetadataWorkUnit]:
+ if self.generate_lineage:
+ yield from self._gen_lineage_workunits()
+ if self.generate_usage_statistics:
+ yield from self._gen_usage_statistics_workunits()
+
+ def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]:
+ for downstream_urn in self._lineage_map:
+ upstreams: List[UpstreamClass] = []
+ fine_upstreams: List[FineGrainedLineageClass] = []
+ for upstream_urn, edge in self._lineage_map[downstream_urn].items():
+ upstreams.append(edge.gen_upstream_aspect())
+ fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects())
+
+ upstream_lineage = UpstreamLineageClass(
+ upstreams=sorted(upstreams, key=lambda x: x.dataset),
+ fineGrainedLineages=sorted(
+ fine_upstreams,
+ key=lambda x: (x.downstreams, x.upstreams),
+ )
+ or None,
+ )
+ yield MetadataChangeProposalWrapper(
+ entityUrn=downstream_urn, aspect=upstream_lineage
+ ).as_workunit()
+
+ def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]:
+ yield from self._usage_aggregator.generate_workunits(
+ resource_urn_builder=lambda urn: urn, user_urn_builder=lambda urn: urn
+ )
+
+
+def _merge_lineage_data(
+ downstream_urn: DatasetUrn,
+ *,
+ upstream_urns: Collection[DatasetUrn],
+ column_lineage: Optional[List[ColumnLineageInfo]],
+ upstream_edges: Dict[DatasetUrn, LineageEdge],
+ query_timestamp: Optional[datetime],
+ is_view_ddl: bool,
+ user: Optional[UserUrn],
+) -> None:
+ for upstream_urn in upstream_urns:
+ edge = upstream_edges.setdefault(
+ upstream_urn,
+ LineageEdge(
+ downstream_urn=downstream_urn,
+ upstream_urn=upstream_urn,
+ audit_stamp=query_timestamp,
+ actor=user,
+ type=DatasetLineageTypeClass.VIEW
+ if is_view_ddl
+ else DatasetLineageTypeClass.TRANSFORMED,
+ ),
+ )
+ if query_timestamp and ( # Use the most recent query
+ edge.audit_stamp is None or query_timestamp > edge.audit_stamp
+ ):
+ edge.audit_stamp = query_timestamp
+ if user:
+ edge.actor = user
+
+ # Note: Inefficient as we loop through all column_lineage entries for each downstream table
+ for cl in column_lineage or []:
+ if cl.downstream.table == downstream_urn:
+ for upstream_column_info in cl.upstreams:
+ if upstream_column_info.table not in upstream_urns:
+ continue
+ column_map = upstream_edges[upstream_column_info.table].column_map
+ column_map[cl.downstream.column].add(upstream_column_info.column)
+
+
+def _compute_upstream_fields(
+ result: SqlParsingResult,
+) -> Dict[DatasetUrn, Set[DatasetUrn]]:
+ upstream_fields: Dict[DatasetUrn, Set[DatasetUrn]] = defaultdict(set)
+ for cl in result.column_lineage or []:
+ for upstream in cl.upstreams:
+ upstream_fields[upstream.table].add(upstream.column)
+ return upstream_fields
+
+
+def _gen_operation_workunit(
+ result: SqlParsingResult,
+ *,
+ downstream_urn: DatasetUrn,
+ query_timestamp: datetime,
+ user: Optional[UserUrn],
+ custom_operation_type: Optional[str],
+) -> Iterable[MetadataWorkUnit]:
+ operation_type = result.query_type.to_operation_type()
+ # Filter out SELECT and other undesired statements
+ if operation_type is None:
+ return
+ elif operation_type == OperationTypeClass.UNKNOWN:
+ if custom_operation_type is None:
+ return
+ else:
+ operation_type = OperationTypeClass.CUSTOM
+
+ aspect = OperationClass(
+ timestampMillis=int(time.time() * 1000),
+ operationType=operation_type,
+ lastUpdatedTimestamp=int(query_timestamp.timestamp() * 1000),
+ actor=user,
+ customOperationType=custom_operation_type,
+ )
+ yield MetadataChangeProposalWrapper(
+ entityUrn=downstream_urn, aspect=aspect
+ ).as_workunit()
diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py
index 8e313e92cbf841..c943b83a887edb 100644
--- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py
+++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py
@@ -435,6 +435,7 @@ def _field_from_complex_type(
field_path._set_parent_type_if_not_exists(
DataHubType(type=MapTypeClass, nested_type=value_type)
)
+ # FIXME: description not set. This is present in schema["description"].
yield from JsonSchemaTranslator.get_fields(
JsonSchemaTranslator._get_type_from_schema(
schema["additionalProperties"]
diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py
index 243c1848279c74..50ea69b6c13a99 100644
--- a/metadata-ingestion/src/datahub/ingestion/graph/client.py
+++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -7,7 +7,7 @@
from dataclasses import dataclass
from datetime import datetime
from json.decoder import JSONDecodeError
-from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Type
from avro.schema import RecordSchema
from deprecated import deprecated
@@ -38,6 +38,8 @@
SystemMetadataClass,
TelemetryClientIdClass,
)
+from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.urns.dataset_urn import DatasetUrn
from datahub.utilities.urns.urn import Urn, guess_entity_type
if TYPE_CHECKING:
@@ -957,7 +959,11 @@ def delete_references_to_urn(
@functools.lru_cache()
def _make_schema_resolver(
- self, platform: str, platform_instance: Optional[str], env: str
+ self,
+ platform: str,
+ platform_instance: Optional[str],
+ env: str,
+ include_graph: bool = True,
) -> "SchemaResolver":
from datahub.utilities.sqlglot_lineage import SchemaResolver
@@ -965,8 +971,50 @@ def _make_schema_resolver(
platform=platform,
platform_instance=platform_instance,
env=env,
- graph=self,
+ graph=self if include_graph else None,
+ )
+
+ def initialize_schema_resolver_from_datahub(
+ self, platform: str, platform_instance: Optional[str], env: str
+ ) -> Tuple["SchemaResolver", Set[str]]:
+ logger.info("Initializing schema resolver")
+
+ # TODO: Filter on platform instance?
+ logger.info(f"Fetching urns for platform {platform}, env {env}")
+ with PerfTimer() as timer:
+ urns = set(
+ self.get_urns_by_filter(
+ entity_types=[DatasetUrn.ENTITY_TYPE],
+ platform=platform,
+ env=env,
+ batch_size=3000,
+ )
+ )
+ logger.info(
+ f"Fetched {len(urns)} urns in {timer.elapsed_seconds()} seconds"
+ )
+
+ schema_resolver = self._make_schema_resolver(
+ platform, platform_instance, env, include_graph=False
)
+ with PerfTimer() as timer:
+ count = 0
+ for i, urn in enumerate(urns):
+ if i % 1000 == 0:
+ logger.debug(f"Loaded {i} schema metadata")
+ try:
+ schema_metadata = self.get_aspect(urn, SchemaMetadataClass)
+ if schema_metadata:
+ schema_resolver.add_schema_metadata(urn, schema_metadata)
+ count += 1
+ except Exception:
+ logger.warning("Failed to load schema metadata", exc_info=True)
+ logger.info(
+ f"Loaded {count} schema metadata in {timer.elapsed_seconds()} seconds"
+ )
+
+ logger.info("Finished initializing schema resolver")
+ return schema_resolver, urns
def parse_sql_lineage(
self,
@@ -982,9 +1030,7 @@ def parse_sql_lineage(
# Cache the schema resolver to make bulk parsing faster.
schema_resolver = self._make_schema_resolver(
- platform=platform,
- platform_instance=platform_instance,
- env=env,
+ platform=platform, platform_instance=platform_instance, env=env
)
return sqlglot_lineage(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
index 7725d63ce0e1ed..1446812c292169 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -429,7 +429,9 @@ def get_dataplatform_instance_aspect(
) -> MetadataWorkUnit:
aspect = DataPlatformInstanceClass(
platform=make_data_platform_urn(self.platform),
- instance=make_dataplatform_instance_urn(self.platform, project_id),
+ instance=make_dataplatform_instance_urn(self.platform, project_id)
+ if self.config.include_data_platform_instance
+ else None,
)
return MetadataChangeProposalWrapper(
entityUrn=dataset_urn, aspect=aspect
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
index e5730ee87daf4d..0f2082c5e53bf2 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -81,6 +81,13 @@ class BigQueryV2Config(
description="Whether to populate BigQuery Console url to Datasets/Tables",
)
+ include_data_platform_instance: bool = Field(
+ default=False,
+ description="Whether to create a DataPlatformInstance aspect, equal to the BigQuery project id."
+ " If enabled, will cause redundancy in the browse path for BigQuery entities in the UI,"
+ " because the project id is represented as the top-level container.",
+ )
+
debug_include_full_payloads: bool = Field(
default=False,
description="Include full payload into events. It is only for debugging and internal use.",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py
index c8a4c7a6ab8fa4..b3fa5e3401c078 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py
@@ -626,12 +626,17 @@ def _extract_lineages(self):
@dataclass
class DebeziumSourceConnector:
connector_manifest: ConnectorManifest
+ report: KafkaConnectSourceReport
def __init__(
- self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig
+ self,
+ connector_manifest: ConnectorManifest,
+ config: KafkaConnectSourceConfig,
+ report: KafkaConnectSourceReport,
) -> None:
self.connector_manifest = connector_manifest
self.config = config
+ self.report = report
self._extract_lineages()
@dataclass
@@ -683,10 +688,19 @@ def get_parser(
database_name=connector_manifest.config.get("database.dbname"),
)
elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector":
+ database_name = connector_manifest.config.get(
+ "database.names"
+ ) or connector_manifest.config.get("database.dbname")
+
+ if "," in str(database_name):
+ raise Exception(
+ f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}"
+ )
+
parser = self.DebeziumParser(
source_platform="mssql",
server_name=self.get_server_name(connector_manifest),
- database_name=connector_manifest.config.get("database.dbname"),
+ database_name=database_name,
)
elif connector_class == "io.debezium.connector.db2.Db2Connector":
parser = self.DebeziumParser(
@@ -707,29 +721,37 @@ def get_parser(
def _extract_lineages(self):
lineages: List[KafkaConnectLineage] = list()
- parser = self.get_parser(self.connector_manifest)
- source_platform = parser.source_platform
- server_name = parser.server_name
- database_name = parser.database_name
- topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name)
- if not self.connector_manifest.topic_names:
- return lineages
+ try:
+ parser = self.get_parser(self.connector_manifest)
+ source_platform = parser.source_platform
+ server_name = parser.server_name
+ database_name = parser.database_name
+ topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name)
- for topic in self.connector_manifest.topic_names:
- found = re.search(re.compile(topic_naming_pattern), topic)
+ if not self.connector_manifest.topic_names:
+ return lineages
- if found:
- table_name = get_dataset_name(database_name, found.group(2))
+ for topic in self.connector_manifest.topic_names:
+ found = re.search(re.compile(topic_naming_pattern), topic)
- lineage = KafkaConnectLineage(
- source_dataset=table_name,
- source_platform=source_platform,
- target_dataset=topic,
- target_platform=KAFKA,
- )
- lineages.append(lineage)
- self.connector_manifest.lineages = lineages
+ if found:
+ table_name = get_dataset_name(database_name, found.group(2))
+
+ lineage = KafkaConnectLineage(
+ source_dataset=table_name,
+ source_platform=source_platform,
+ target_dataset=topic,
+ target_platform=KAFKA,
+ )
+ lineages.append(lineage)
+ self.connector_manifest.lineages = lineages
+ except Exception as e:
+ self.report.report_warning(
+ self.connector_manifest.name, f"Error resolving lineage: {e}"
+ )
+
+ return
@dataclass
@@ -1061,7 +1083,9 @@ def get_connectors_manifest(self) -> List[ConnectorManifest]:
"io.debezium.connector"
):
connector_manifest = DebeziumSourceConnector(
- connector_manifest=connector_manifest, config=self.config
+ connector_manifest=connector_manifest,
+ config=self.config,
+ report=self.report,
).connector_manifest
elif (
connector_manifest.config.get(CONNECTOR_CLASS, "")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py
index 497b49acb65055..e1d035a96d42fe 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py
@@ -271,10 +271,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
if dn is None:
continue
- if not attrs:
+ if not attrs or "objectClass" not in attrs:
self.report.report_warning(
"",
- f"skipping {dn} because attrs is empty; check your permissions if this is unexpected",
+ f"skipping {dn} because attrs ({attrs}) does not contain expected data; "
+ f"check your permissions if this is unexpected",
)
continue
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
index d568ddcb02afa2..40b90d216348c7 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
@@ -34,6 +34,7 @@
from datahub.ingestion.source.state.stale_entity_removal_handler import (
StaleEntityRemovalSourceReport,
)
+from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
DatasetLineageTypeClass,
FineGrainedLineageDownstreamType,
@@ -76,6 +77,8 @@
from datahub.utilities.lossy_collections import LossyList, LossySet
from datahub.utilities.url_util import remove_port_from_url
+CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
+
if TYPE_CHECKING:
from datahub.ingestion.source.looker.lookml_source import (
LookerViewFileLoader,
@@ -786,6 +789,7 @@ def _to_metadata_events( # noqa: C901
if self.upstream_views is not None:
assert self.project_name is not None
upstreams = []
+ observed_lineage_ts = datetime.datetime.now(tz=datetime.timezone.utc)
for view_ref in sorted(self.upstream_views):
view_urn = LookerViewId(
project_name=view_ref.project
@@ -799,6 +803,10 @@ def _to_metadata_events( # noqa: C901
UpstreamClass(
dataset=view_urn,
type=DatasetLineageTypeClass.VIEW,
+ auditStamp=AuditStamp(
+ time=int(observed_lineage_ts.timestamp() * 1000),
+ actor=CORPUSER_DATAHUB,
+ ),
)
)
view_name_to_urn_map[view_ref.include] = view_urn
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py
index 362b4e5530638e..1a32afa2b7fdd6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py
@@ -6,7 +6,7 @@
import re
import tempfile
from dataclasses import dataclass, field as dataclass_field, replace
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
from typing import (
Any,
ClassVar,
@@ -50,6 +50,7 @@
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
from datahub.ingestion.source.git.git_import import GitClone
from datahub.ingestion.source.looker.looker_common import (
+ CORPUSER_DATAHUB,
LookerCommonConfig,
LookerExplore,
LookerUtil,
@@ -83,6 +84,7 @@
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.schema_classes import (
+ AuditStampClass,
DatasetPropertiesClass,
FineGrainedLineageClass,
FineGrainedLineageUpstreamTypeClass,
@@ -1615,11 +1617,16 @@ def _get_upstream_lineage(
# Generate the upstream + fine grained lineage objects.
upstreams = []
+ observed_lineage_ts = datetime.now(tz=timezone.utc)
fine_grained_lineages: List[FineGrainedLineageClass] = []
for upstream_dataset_urn in upstream_dataset_urns:
upstream = UpstreamClass(
dataset=upstream_dataset_urn,
type=DatasetLineageTypeClass.VIEW,
+ auditStamp=AuditStampClass(
+ time=int(observed_lineage_ts.timestamp() * 1000),
+ actor=CORPUSER_DATAHUB,
+ ),
)
upstreams.append(upstream)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
index 31d067f984d2d6..ffa685fb258267 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
@@ -121,6 +121,12 @@ class DataPlatformPair:
powerbi_data_platform_name: str
+@dataclass
+class PowerBIPlatformDetail:
+ data_platform_pair: DataPlatformPair
+ data_platform_server: str
+
+
class SupportedDataPlatform(Enum):
POSTGRES_SQL = DataPlatformPair(
powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres"
@@ -382,6 +388,15 @@ class PowerBiDashboardSourceConfig(
description="The instance of the platform that all assets produced by this recipe belong to",
)
+    # Enable advanced SQL construct parsing
+ enable_advance_lineage_sql_construct: bool = pydantic.Field(
+ default=False,
+        description="Whether to enable advanced native SQL constructs for parsing, such as joins and sub-queries. "
+        "Along with this flag, native_query_parsing should be enabled. "
+        "By default convert_lineage_urns_to_lowercase is enabled; if you disabled it in a previous ingestion execution, enabling this option may break lineage, "
+        "as it generates the upstream dataset URNs in lowercase.",
+ )
+
@validator("dataset_type_mapping")
@classmethod
def map_data_platform(cls, value):
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py
index 396da2d79e3b76..baaa8d5b85ae10 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py
@@ -5,8 +5,8 @@
from datahub.ingestion.source.powerbi.config import (
PlatformDetail,
PowerBiDashboardSourceConfig,
+ PowerBIPlatformDetail,
)
-from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable
logger = logging.getLogger(__name__)
@@ -14,7 +14,7 @@
class AbstractDataPlatformInstanceResolver(ABC):
@abstractmethod
def get_platform_instance(
- self, dataplatform_table: DataPlatformTable
+ self, data_platform_detail: PowerBIPlatformDetail
) -> PlatformDetail:
pass
@@ -32,10 +32,10 @@ class ResolvePlatformInstanceFromDatasetTypeMapping(
BaseAbstractDataPlatformInstanceResolver
):
def get_platform_instance(
- self, dataplatform_table: DataPlatformTable
+ self, data_platform_detail: PowerBIPlatformDetail
) -> PlatformDetail:
platform: Union[str, PlatformDetail] = self.config.dataset_type_mapping[
- dataplatform_table.data_platform_pair.powerbi_data_platform_name
+ data_platform_detail.data_platform_pair.powerbi_data_platform_name
]
if isinstance(platform, PlatformDetail):
@@ -48,13 +48,13 @@ class ResolvePlatformInstanceFromServerToPlatformInstance(
BaseAbstractDataPlatformInstanceResolver
):
def get_platform_instance(
- self, dataplatform_table: DataPlatformTable
+ self, data_platform_detail: PowerBIPlatformDetail
) -> PlatformDetail:
return (
self.config.server_to_platform_instance[
- dataplatform_table.datasource_server
+ data_platform_detail.data_platform_server
]
- if dataplatform_table.datasource_server
+ if data_platform_detail.data_platform_server
in self.config.server_to_platform_instance
else PlatformDetail.parse_obj({})
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py
index 640bc4bd60d80f..021c429c3c6333 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py
@@ -1,8 +1,12 @@
import logging
-from typing import List
+from typing import List, Optional
import sqlparse
+import datahub.utilities.sqlglot_lineage as sqlglot_l
+from datahub.ingestion.api.common import PipelineContext
+from datahub.utilities.sqlglot_lineage import SqlParsingResult
+
SPECIAL_CHARACTERS = ["#(lf)", "(lf)"]
logger = logging.getLogger()
@@ -45,3 +49,30 @@ def get_tables(native_query: str) -> List[str]:
from_index = from_index + 1
return tables
+
+
+def parse_custom_sql(
+ ctx: PipelineContext,
+ query: str,
+ schema: Optional[str],
+ database: Optional[str],
+ platform: str,
+ env: str,
+ platform_instance: Optional[str],
+) -> Optional["SqlParsingResult"]:
+
+ logger.debug("Using sqlglot_lineage to parse custom sql")
+
+ sql_query = remove_special_characters(query)
+
+ logger.debug(f"Parsing sql={sql_query}")
+
+ return sqlglot_l.create_lineage_sql_parsed_result(
+ query=sql_query,
+ schema=schema,
+ database=database,
+ platform=platform,
+ platform_instance=platform_instance,
+ env=env,
+ graph=ctx.graph,
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py
index 83106c04529d18..8cc38c366c42a4 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py
@@ -6,7 +6,14 @@
import lark
from lark import Lark, Tree
-from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.source.powerbi.config import (
+ PowerBiDashboardSourceConfig,
+ PowerBiDashboardSourceReport,
+)
+from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
+ AbstractDataPlatformInstanceResolver,
+)
from datahub.ingestion.source.powerbi.m_query import resolver, validator
from datahub.ingestion.source.powerbi.m_query.data_classes import (
TRACE_POWERBI_MQUERY_PARSER,
@@ -45,7 +52,9 @@ def _parse_expression(expression: str) -> Tree:
def get_upstream_tables(
table: Table,
reporter: PowerBiDashboardSourceReport,
- native_query_enabled: bool = True,
+ platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+ ctx: PipelineContext,
+ config: PowerBiDashboardSourceConfig,
parameters: Dict[str, str] = {},
) -> List[resolver.DataPlatformTable]:
if table.expression is None:
@@ -58,7 +67,7 @@ def get_upstream_tables(
parse_tree: Tree = _parse_expression(table.expression)
valid, message = validator.validate_parse_tree(
- parse_tree, native_query_enabled=native_query_enabled
+ parse_tree, native_query_enabled=config.native_query_parsing
)
if valid is False:
assert message is not None
@@ -84,7 +93,11 @@ def get_upstream_tables(
parse_tree=parse_tree,
reporter=reporter,
parameters=parameters,
- ).resolve_to_data_platform_table_list()
+ ).resolve_to_data_platform_table_list(
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
+ )
except BaseException as e:
reporter.report_warning(table.full_name, "Failed to process m-query expression")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py
index e2b448124c89d9..479f1decff903d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py
@@ -6,11 +6,19 @@
from lark import Tree
+import datahub.emitter.mce_builder as builder
+from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.powerbi.config import (
DataPlatformPair,
+ PlatformDetail,
+ PowerBiDashboardSourceConfig,
PowerBiDashboardSourceReport,
+ PowerBIPlatformDetail,
SupportedDataPlatform,
)
+from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
+ AbstractDataPlatformInstanceResolver,
+)
from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function
from datahub.ingestion.source.powerbi.m_query.data_classes import (
TRACE_POWERBI_MQUERY_PARSER,
@@ -19,19 +27,98 @@
IdentifierAccessor,
)
from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
+from datahub.utilities.sqlglot_lineage import SqlParsingResult
logger = logging.getLogger(__name__)
@dataclass
class DataPlatformTable:
- name: str
- full_name: str
- datasource_server: str
data_platform_pair: DataPlatformPair
+ urn: str
+
+
+def urn_to_lowercase(value: str, flag: bool) -> str:
+ if flag is True:
+ return value.lower()
+
+ return value
+
+
+def urn_creator(
+ config: PowerBiDashboardSourceConfig,
+ platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+ data_platform_pair: DataPlatformPair,
+ server: str,
+ qualified_table_name: str,
+) -> str:
+
+ platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance(
+ PowerBIPlatformDetail(
+ data_platform_pair=data_platform_pair,
+ data_platform_server=server,
+ )
+ )
+
+ return builder.make_dataset_urn_with_platform_instance(
+ platform=data_platform_pair.datahub_data_platform_name,
+ platform_instance=platform_detail.platform_instance,
+ env=platform_detail.env,
+ name=urn_to_lowercase(
+ qualified_table_name, config.convert_lineage_urns_to_lowercase
+ ),
+ )
class AbstractDataPlatformTableCreator(ABC):
+    """
+    Base class sharing common functionality among the different data platforms for M-Query parsing.
+
+    To create a qualified table name we need to parse the M-Query data-access functions
+    (https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions). Each data-access function has a defined
+    pattern for accessing the database name, schema name and table name; for example, see the M-Query below.
+
+        let
+            Source = Sql.Database("localhost", "library"),
+            dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
+        in
+            dbo_book_issue
+
+    This is MSSQL M-Query, and Sql.Database is the data-access function for MSSQL. When this function is present, the
+    database name is in the second argument of the first statement, while the schema name and table name are in the
+    second statement. The second statement can be repeated to access different tables from MSSQL.
+
+    DefaultTwoStepDataAccessSources extends AbstractDataPlatformTableCreator and provides the common functionality for
+    data platforms with this type of M-Query pattern.
+
+    The data-access function varies per data platform (for example MySQL.Database for MySQL, PostgreSQL.Database for
+    Postgres and Oracle.Database for Oracle), and the number of statements needed to find the database name, schema
+    name and table name also varies per data platform.
+
+    Value.NativeQuery is one of the functions used to execute a native query inside M-Query, for example:
+
+        let
+            Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true])
+        in
+            Source
+
+    In this M-Query the database name is available in the first argument, and the remaining details (database and
+    schema) are available in the native query itself.
+
+    NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake
+    native query parsing.
+    """
+
+ ctx: PipelineContext
+ config: PowerBiDashboardSourceConfig
+ platform_instance_resolver: AbstractDataPlatformInstanceResolver
+
+ def __init__(
+ self,
+ ctx: PipelineContext,
+ config: PowerBiDashboardSourceConfig,
+ platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+ ) -> None:
+ super().__init__()
+ self.ctx = ctx
+ self.config = config
+ self.platform_instance_resolver = platform_instance_resolver
+
@abstractmethod
def create_dataplatform_tables(
self, data_access_func_detail: DataAccessFunctionDetail
@@ -58,6 +145,49 @@ def get_db_detail_from_argument(
return arguments[0], arguments[1]
+ def parse_custom_sql(
+ self, query: str, server: str, database: Optional[str], schema: Optional[str]
+ ) -> List[DataPlatformTable]:
+
+ dataplatform_tables: List[DataPlatformTable] = []
+
+ platform_detail: PlatformDetail = (
+ self.platform_instance_resolver.get_platform_instance(
+ PowerBIPlatformDetail(
+ data_platform_pair=self.get_platform_pair(),
+ data_platform_server=server,
+ )
+ )
+ )
+
+ parsed_result: Optional[
+ "SqlParsingResult"
+ ] = native_sql_parser.parse_custom_sql(
+ ctx=self.ctx,
+ query=query,
+ platform=self.get_platform_pair().datahub_data_platform_name,
+ platform_instance=platform_detail.platform_instance,
+ env=platform_detail.env,
+ database=database,
+ schema=schema,
+ )
+
+ if parsed_result is None:
+ logger.debug("Failed to parse query")
+ return dataplatform_tables
+
+ for urn in parsed_result.in_tables:
+ dataplatform_tables.append(
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ )
+
+ logger.debug(f"Generated dataplatform_tables={dataplatform_tables}")
+
+ return dataplatform_tables
+
class AbstractDataAccessMQueryResolver(ABC):
table: Table
@@ -80,11 +210,29 @@ def __init__(
self.data_access_functions = SupportedResolver.get_function_names()
@abstractmethod
- def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]:
+ def resolve_to_data_platform_table_list(
+ self,
+ ctx: PipelineContext,
+ config: PowerBiDashboardSourceConfig,
+ platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+ ) -> List[DataPlatformTable]:
pass
class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
+    """
+    Parses the M-Query recursively to generate DataAccessFunctionDetail instances (see create_data_access_functional_detail).
+
+    This class contains the generic code for processing M-Query tokens and creating DataAccessFunctionDetail instances.
+
+    Once a DataAccessFunctionDetail instance has been created, MQueryResolver generates the DataPlatformTable with the
+    help of AbstractDataPlatformTableCreator (see resolve_to_data_platform_table_list).
+
+    Classes that extend AbstractDataPlatformTableCreator know how to convert a generated DataAccessFunctionDetail
+    instance into the corresponding DataPlatformTable instance for their data platform.
+    """
+
def get_item_selector_tokens(
self,
expression_tree: Tree,
@@ -318,9 +466,15 @@ def internal(
return table_links
- def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]:
+ def resolve_to_data_platform_table_list(
+ self,
+ ctx: PipelineContext,
+ config: PowerBiDashboardSourceConfig,
+ platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+ ) -> List[DataPlatformTable]:
data_platform_tables: List[DataPlatformTable] = []
+        # Find the output variable; the M-Query is processed by backtracking from it
output_variable: Optional[str] = tree_function.get_output_variable(
self.parse_tree
)
@@ -332,12 +486,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]:
)
return data_platform_tables
+        # Parse the M-Query, using output_variable as the root of the tree, and create DataAccessFunctionDetail instances
table_links: List[
DataAccessFunctionDetail
] = self.create_data_access_functional_detail(output_variable)
# Each item is data-access function
for f_detail in table_links:
+            # Check whether the data-access function found in the M-Query is supported
supported_resolver = SupportedResolver.get_resolver(
f_detail.data_access_function_name
)
@@ -351,8 +507,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]:
)
continue
+            # From the supported_resolver enum, get the corresponding resolver (e.g. AmazonRedshift, Snowflake, Oracle or NativeQuery),
+            # create an instance of it, and pass the additional information needed to generate the urn
table_full_name_creator: AbstractDataPlatformTableCreator = (
- supported_resolver.get_table_full_name_creator()()
+ supported_resolver.get_table_full_name_creator()(
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
+ )
)
data_platform_tables.extend(
@@ -393,18 +555,24 @@ def two_level_access_pattern(
IdentifierAccessor, data_access_func_detail.identifier_accessor
).items["Item"]
- full_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+ qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
logger.debug(
- f"Platform({self.get_platform_pair().datahub_data_platform_name}) full_table_name= {full_table_name}"
+ f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
+ )
+
+ urn = urn_creator(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
)
return [
DataPlatformTable(
- name=table_name,
- full_name=full_table_name,
- datasource_server=server,
data_platform_pair=self.get_platform_pair(),
+ urn=urn,
)
]
@@ -420,9 +588,48 @@ def get_platform_pair(self) -> DataPlatformPair:
class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources):
+ # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16
+ DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo
+
def get_platform_pair(self) -> DataPlatformPair:
return SupportedDataPlatform.MS_SQL.value
+ def create_urn_using_old_parser(
+ self, query: str, db_name: str, server: str
+ ) -> List[DataPlatformTable]:
+ dataplatform_tables: List[DataPlatformTable] = []
+
+ tables: List[str] = native_sql_parser.get_tables(query)
+
+ for table in tables:
+ schema_and_table: List[str] = table.split(".")
+ if len(schema_and_table) == 1:
+ # schema name is not present. set default schema
+ schema_and_table.insert(0, MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA)
+
+ qualified_table_name = (
+ f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}"
+ )
+
+ urn = urn_creator(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
+
+ dataplatform_tables.append(
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ )
+
+ logger.debug(f"Generated upstream tables = {dataplatform_tables}")
+
+ return dataplatform_tables
+
def create_dataplatform_tables(
self, data_access_func_detail: DataAccessFunctionDetail
) -> List[DataPlatformTable]:
@@ -442,28 +649,20 @@ def create_dataplatform_tables(
logger.debug("Unsupported case is found. Second index is not the Query")
return dataplatform_tables
- db_name: str = arguments[1]
-
- tables: List[str] = native_sql_parser.get_tables(arguments[3])
- for table in tables:
- schema_and_table: List[str] = table.split(".")
- if len(schema_and_table) == 1:
- # schema name is not present. Default schema name in MS-SQL is dbo
- # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16
- schema_and_table.insert(0, "dbo")
-
- dataplatform_tables.append(
- DataPlatformTable(
- name=schema_and_table[1],
- full_name=f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}",
- datasource_server=arguments[0],
- data_platform_pair=self.get_platform_pair(),
- )
+ if self.config.enable_advance_lineage_sql_construct is False:
+ # Use previous parser to generate URN to keep backward compatibility
+ return self.create_urn_using_old_parser(
+ query=arguments[3],
+ db_name=arguments[1],
+ server=arguments[0],
)
- logger.debug("MS-SQL full-table-names %s", dataplatform_tables)
-
- return dataplatform_tables
+ return self.parse_custom_sql(
+ query=arguments[3],
+ database=arguments[1],
+ server=arguments[0],
+ schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA,
+ )
class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator):
@@ -510,12 +709,20 @@ def create_dataplatform_tables(
cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
).items["Name"]
+ qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+ urn = urn_creator(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
+
return [
DataPlatformTable(
- name=table_name,
- full_name=f"{db_name}.{schema_name}.{table_name}",
- datasource_server=server,
data_platform_pair=self.get_platform_pair(),
+ urn=urn,
)
]
@@ -547,14 +754,28 @@ def create_dataplatform_tables(
db_name: str = value_dict["Database"]
schema_name: str = value_dict["Schema"]
table_name: str = value_dict["Table"]
+
+ qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
server, _ = self.get_db_detail_from_argument(data_access_func_detail.arg_list)
+ if server is None:
+ logger.info(
+ f"Server information is not available for {qualified_table_name}. Skipping upstream table."
+ )
+ return []
+
+ urn = urn_creator(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
return [
DataPlatformTable(
- name=table_name,
- full_name=f"{db_name}.{schema_name}.{table_name}",
- datasource_server=server if server else "",
data_platform_pair=self.get_platform_pair(),
+ urn=urn,
)
]
@@ -589,20 +810,26 @@ def create_dataplatform_tables(
IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore
).items["Name"]
- full_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+ qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
logger.debug(
- f"{self.get_platform_pair().datahub_data_platform_name} full-table-name {full_table_name}"
+ f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}"
+ )
+
+ server: str = self.get_datasource_server(arguments, data_access_func_detail)
+
+ urn = urn_creator(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
)
return [
DataPlatformTable(
- name=table_name,
- full_name=full_table_name,
- datasource_server=self.get_datasource_server(
- arguments, data_access_func_detail
- ),
data_platform_pair=self.get_platform_pair(),
+ urn=urn,
)
]
@@ -654,12 +881,20 @@ def create_dataplatform_tables(
cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
).items["Name"]
+ qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+ urn = urn_creator(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
+
return [
DataPlatformTable(
- name=table_name,
- full_name=f"{db_name}.{schema_name}.{table_name}",
- datasource_server=server,
data_platform_pair=self.get_platform_pair(),
+ urn=urn,
)
]
@@ -681,6 +916,39 @@ def is_native_parsing_supported(data_access_function_name: str) -> bool:
in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM
)
+ def create_urn_using_old_parser(
+ self, query: str, server: str
+ ) -> List[DataPlatformTable]:
+ dataplatform_tables: List[DataPlatformTable] = []
+
+ tables: List[str] = native_sql_parser.get_tables(query)
+
+ for qualified_table_name in tables:
+ if len(qualified_table_name.split(".")) != 3:
+ logger.debug(
+ f"Skipping table {qualified_table_name} as it is not in the qualified database.schema.table format"
+ )
+ continue
+
+ urn = urn_creator(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
+
+ dataplatform_tables.append(
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ )
+
+ logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")
+
+ return dataplatform_tables
+
def create_dataplatform_tables(
self, data_access_func_detail: DataAccessFunctionDetail
) -> List[DataPlatformTable]:
@@ -727,25 +995,21 @@ def create_dataplatform_tables(
0
] # Remove any whitespaces and double quotes character
- for table in native_sql_parser.get_tables(sql_query):
- if len(table.split(".")) != 3:
- logger.debug(
- f"Skipping table {table} as it is not as per full_table_name format"
- )
- continue
+ server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
- dataplatform_tables.append(
- DataPlatformTable(
- name=table.split(".")[2],
- full_name=table,
- datasource_server=tree_function.strip_char_from_list(
- [data_access_tokens[2]]
- )[0],
- data_platform_pair=self.get_platform_pair(),
- )
+ if self.config.enable_advance_lineage_sql_construct is False:
+ # Use previous parser to generate URN to keep backward compatibility
+ return self.create_urn_using_old_parser(
+ query=sql_query,
+ server=server,
)
- return dataplatform_tables
+ return self.parse_custom_sql(
+ query=sql_query,
+ server=server,
+ database=None,  # database and schema are available inside the custom SQL, as per PowerBI behavior
+ schema=None,
+ )
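
A note on the old-parser branch of the native-query handler above: only fully qualified `database.schema.table` references are kept, because a native query carries its database and schema inside the SQL itself and bare names cannot be resolved reliably. A tiny illustration with hypothetical table names:

tables = ["analytics.public.orders", "orders", "public.orders"]
kept = [t for t in tables if len(t.split(".")) == 3]
print(kept)  # ['analytics.public.orders'] -- the other references are skipped with a debug log
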
class FunctionName(Enum):
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
index 919cb83e4d832c..5d477ee090e7e6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
@@ -28,7 +28,6 @@
)
from datahub.ingestion.source.powerbi.config import (
Constant,
- PlatformDetail,
PowerBiDashboardSourceConfig,
PowerBiDashboardSourceReport,
)
@@ -96,10 +95,12 @@ def __hash__(self):
def __init__(
self,
+ ctx: PipelineContext,
config: PowerBiDashboardSourceConfig,
reporter: PowerBiDashboardSourceReport,
dataplatform_instance_resolver: AbstractDataPlatformInstanceResolver,
):
+ self.__ctx = ctx
self.__config = config
self.__reporter = reporter
self.__dataplatform_instance_resolver = dataplatform_instance_resolver
@@ -172,43 +173,40 @@ def extract_lineage(
# table.dataset should always be set, but we check it just in case.
parameters = table.dataset.parameters if table.dataset else {}
- upstreams: List[UpstreamClass] = []
- upstream_tables: List[resolver.DataPlatformTable] = parser.get_upstream_tables(
- table, self.__reporter, parameters=parameters
+ upstream: List[UpstreamClass] = []
+
+ upstream_dpts: List[resolver.DataPlatformTable] = parser.get_upstream_tables(
+ table=table,
+ reporter=self.__reporter,
+ platform_instance_resolver=self.__dataplatform_instance_resolver,
+ ctx=self.__ctx,
+ config=self.__config,
+ parameters=parameters,
)
+
logger.debug(
- f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_tables}"
+ f"PowerBI virtual table {table.full_name} and its upstream dataplatform tables = {upstream_dpts}"
)
- for upstream_table in upstream_tables:
+
+ for upstream_dpt in upstream_dpts:
if (
- upstream_table.data_platform_pair.powerbi_data_platform_name
+ upstream_dpt.data_platform_pair.powerbi_data_platform_name
not in self.__config.dataset_type_mapping.keys()
):
logger.debug(
- f"Skipping upstream table for {ds_urn}. The platform {upstream_table.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping",
+ f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping",
)
continue
- platform_detail: PlatformDetail = (
- self.__dataplatform_instance_resolver.get_platform_instance(
- upstream_table
- )
- )
- upstream_urn = builder.make_dataset_urn_with_platform_instance(
- platform=upstream_table.data_platform_pair.datahub_data_platform_name,
- platform_instance=platform_detail.platform_instance,
- env=platform_detail.env,
- name=self.lineage_urn_to_lowercase(upstream_table.full_name),
- )
-
upstream_table_class = UpstreamClass(
- upstream_urn,
+ upstream_dpt.urn,
DatasetLineageTypeClass.TRANSFORMED,
)
- upstreams.append(upstream_table_class)
- if len(upstreams) > 0:
- upstream_lineage = UpstreamLineageClass(upstreams=upstreams)
+ upstream.append(upstream_table_class)
+
+ if len(upstream) > 0:
+ upstream_lineage = UpstreamLineageClass(upstreams=upstream)
logger.debug(f"Dataset urn = {ds_urn} and its lineage = {upstream_lineage}")
mcp = MetadataChangeProposalWrapper(
entityType=Constant.DATASET,
@@ -1107,7 +1105,9 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext):
) # Exit pipeline as we are not able to connect to PowerBI API Service. This exit will avoid raising
# unwanted stacktrace on console
- self.mapper = Mapper(config, self.reporter, self.dataplatform_instance_resolver)
+ self.mapper = Mapper(
+ ctx, config, self.reporter, self.dataplatform_instance_resolver
+ )
# Create and register the stateful ingestion use-case handler.
self.stale_entity_removal_handler = StaleEntityRemovalHandler.create(
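
The net effect of the extract_lineage changes is that the PowerBI parser now returns fully resolved dataset URNs, so the mapper only wraps them in lineage aspects instead of rebuilding URNs from `full_name`. A minimal sketch of that wrapping; the URN below is a made-up example:

from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

# The urn now arrives pre-resolved from the parser (platform instance and env already applied).
upstream = UpstreamClass(
    dataset="urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_db.public.orders,PROD)",
    type=DatasetLineageTypeClass.TRANSFORMED,
)
lineage = UpstreamLineageClass(upstreams=[upstream])
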
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py
index e8e80e172a9ce1..a7d946e99d8061 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py
@@ -1,10 +1,12 @@
import logging
+from collections import defaultdict
+from dataclasses import dataclass
from enum import Enum
-from typing import Dict, List, Optional, cast
+from typing import Dict, List, Optional, Set, cast
from pydantic import Field, SecretStr, root_validator, validator
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.configuration.pattern_utils import UUID_REGEX
from datahub.configuration.validate_field_removal import pydantic_removed_field
from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -42,6 +44,31 @@ class TagOption(str, Enum):
skip = "skip"
+@dataclass(frozen=True)
+class DatabaseId:
+ database: str = Field(
+ description="Database created from share in consumer account."
+ )
+ platform_instance: str = Field(
+ description="Platform instance of consumer snowflake account."
+ )
+
+
+class SnowflakeShareConfig(ConfigModel):
+ database: str = Field(description="Database from which share is created.")
+ platform_instance: str = Field(
+ description="Platform instance for snowflake account in which share is created."
+ )
+
+ consumers: Set[DatabaseId] = Field(
+ description="List of databases created in consumer accounts."
+ )
+
+ @property
+ def source_database(self) -> DatabaseId:
+ return DatabaseId(self.database, self.platform_instance)
+
+
class SnowflakeV2Config(
SnowflakeConfig,
SnowflakeUsageConfig,
@@ -91,13 +118,8 @@ class SnowflakeV2Config(
description="Whether `schema_pattern` is matched against fully qualified schema name `.`.",
)
- use_legacy_lineage_method: bool = Field(
- default=False,
- description=(
- "Whether to use the legacy lineage computation method. "
- "By default, uses new optimised lineage extraction method that requires less ingestion process memory. "
- "Table-to-view and view-to-view column-level lineage are not supported with the legacy method."
- ),
+ _use_legacy_lineage_method_removed = pydantic_removed_field(
+ "use_legacy_lineage_method"
)
validate_upstreams_against_patterns: bool = Field(
@@ -113,13 +135,20 @@ class SnowflakeV2Config(
# This is required since access_history table does not capture whether the table was temporary table.
temporary_tables_pattern: List[str] = Field(
default=DEFAULT_TABLES_DENY_LIST,
- description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools. Not used if `use_legacy_lineage_method=True`",
+ description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. Defaults are set in such a way as to ignore the temporary staging tables created by known ETL tools.",
)
rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field(
"upstreams_deny_pattern", "temporary_tables_pattern"
)
+ shares: Optional[Dict[str, SnowflakeShareConfig]] = Field(
+ default=None,
+ description="Required if the current account owns or consumes a snowflake share."
+ " If specified, the connector creates lineage and sibling relationships between the current account's database tables and the consumer/producer accounts' database tables."
+ " Map of share name -> details of share.",
+ )
+
email_as_user_identifier: bool = Field(
default=True,
description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is provided, generates email addresses for snowflake users with unset emails, based on their username.",
@@ -197,3 +226,77 @@ def get_sql_alchemy_url(
@property
def parse_view_ddl(self) -> bool:
return self.include_view_column_lineage
+
+ @validator("shares")
+ def validate_shares(
+ cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict
+ ) -> Optional[Dict[str, SnowflakeShareConfig]]:
+ current_platform_instance = values.get("platform_instance")
+
+ if shares:
+ # Check: platform_instance should be present
+ assert current_platform_instance is not None, (
+ "Did you forget to set `platform_instance` for the current ingestion? "
+ "It is required to use `platform_instance` when ingesting from multiple snowflake accounts."
+ )
+
+ databases_included_in_share: List[DatabaseId] = []
+ databases_created_from_share: List[DatabaseId] = []
+
+ for share_details in shares.values():
+ shared_db = DatabaseId(
+ share_details.database, share_details.platform_instance
+ )
+ assert all(
+ consumer.platform_instance != share_details.platform_instance
+ for consumer in share_details.consumers
+ ), "A share's platform_instance cannot be the same as a consumer's platform_instance. Self-sharing is not supported in Snowflake."
+
+ databases_included_in_share.append(shared_db)
+ databases_created_from_share.extend(share_details.consumers)
+
+ for db_from_share in databases_created_from_share:
+ assert (
+ db_from_share not in databases_included_in_share
+ ), "A database included in a share cannot also be present as a consumer in any share."
+ assert (
+ databases_created_from_share.count(db_from_share) == 1
+ ), "The same database cannot be present as a consumer in more than one share."
+
+ return shares
+
+ def outbounds(self) -> Dict[str, Set[DatabaseId]]:
+ """
+ Returns a mapping of
+ database included in the current account's outbound share -> all databases created from this share in other accounts
+ """
+ outbounds: Dict[str, Set[DatabaseId]] = defaultdict(set)
+ if self.shares:
+ for share_name, share_details in self.shares.items():
+ if share_details.platform_instance == self.platform_instance:
+ logger.debug(
+ f"database {share_details.database} is included in outbound share(s) {share_name}."
+ )
+ outbounds[share_details.database].update(share_details.consumers)
+ return outbounds
+
+ def inbounds(self) -> Dict[str, DatabaseId]:
+ """
+ Returns a mapping of
+ database created from the current account's inbound share -> other-account database from which this share was created
+ """
+ inbounds: Dict[str, DatabaseId] = {}
+ if self.shares:
+ for share_name, share_details in self.shares.items():
+ for consumer in share_details.consumers:
+ if consumer.platform_instance == self.platform_instance:
+ logger.debug(
+ f"database {consumer.database} is created from inbound share {share_name}."
+ )
+ inbounds[consumer.database] = share_details.source_database
+ break
+ else:
+ logger.info(
+ f"Skipping Share {share_name}, as it does not include current platform instance {self.platform_instance}",
+ )
+ return inbounds
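
To make the new `shares` configuration concrete, here is a hypothetical two-account setup and the lookups the helpers above would produce; all names are illustrative and not part of the patch:

# Hypothetical `shares` section, shown as the parsed dict a recipe would produce:
shares = {
    "share1": {
        "database": "db1",                  # producer-side database
        "platform_instance": "instance1",   # producer account's platform_instance
        "consumers": [
            {"database": "db1_from_share", "platform_instance": "instance2"},
        ],
    }
}
# Ingesting with platform_instance "instance1":
#   outbounds() -> {"db1": {DatabaseId("db1_from_share", "instance2")}}
# Ingesting with platform_instance "instance2":
#   inbounds()  -> {"db1_from_share": DatabaseId("db1", "instance1")}
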
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py
deleted file mode 100644
index 832a072c619f8a..00000000000000
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py
+++ /dev/null
@@ -1,664 +0,0 @@
-import json
-import logging
-from collections import defaultdict
-from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set
-
-from pydantic import Field
-from pydantic.error_wrappers import ValidationError
-from snowflake.connector import SnowflakeConnection
-
-import datahub.emitter.mce_builder as builder
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.aws.s3_util import make_s3_urn
-from datahub.ingestion.source.snowflake.constants import (
- LINEAGE_PERMISSION_ERROR,
- SnowflakeEdition,
- SnowflakeObjectDomain,
-)
-from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config
-from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
-from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
-from datahub.ingestion.source.snowflake.snowflake_usage_v2 import (
- SnowflakeColumnReference,
-)
-from datahub.ingestion.source.snowflake.snowflake_utils import (
- SnowflakeCommonMixin,
- SnowflakeConnectionMixin,
- SnowflakePermissionError,
- SnowflakeQueryMixin,
-)
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
- FineGrainedLineage,
- FineGrainedLineageDownstreamType,
- FineGrainedLineageUpstreamType,
- UpstreamLineage,
-)
-from datahub.metadata.schema_classes import DatasetLineageTypeClass, UpstreamClass
-from datahub.utilities.perf_timer import PerfTimer
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-
-class SnowflakeColumnWithLineage(SnowflakeColumnReference):
- class Config:
- # This is for backward compatibility and can be removed later
- allow_population_by_field_name = True
-
- directSourceColumns: Optional[List[SnowflakeColumnReference]] = Field(
- default=None, alias="directSources"
- )
-
-
-@dataclass(frozen=True)
-class SnowflakeColumnId:
- columnName: str
- objectName: str
- objectDomain: Optional[str] = None
-
-
-@dataclass(frozen=True)
-class SnowflakeColumnFineGrainedLineage:
- """
- Fie grained upstream of column,
- which represents a transformation applied on input columns"""
-
- inputColumns: FrozenSet[SnowflakeColumnId]
- # Transform function, query etc can be added here
-
-
-@dataclass
-class SnowflakeColumnUpstreams:
- """All upstreams of a column"""
-
- upstreams: Set[SnowflakeColumnFineGrainedLineage] = field(
- default_factory=set, init=False
- )
-
- def update_column_lineage(
- self, directSourceColumns: List[SnowflakeColumnReference]
- ) -> None:
- input_columns = frozenset(
- [
- SnowflakeColumnId(
- upstream_col.columnName,
- upstream_col.objectName,
- upstream_col.objectDomain,
- )
- for upstream_col in directSourceColumns
- if upstream_col.objectName
- ]
- )
- if not input_columns:
- return
- upstream = SnowflakeColumnFineGrainedLineage(inputColumns=input_columns)
- if upstream not in self.upstreams:
- self.upstreams.add(upstream)
-
-
-@dataclass
-class SnowflakeUpstreamTable:
- upstreamDataset: str
- upstreamColumns: List[SnowflakeColumnReference]
- downstreamColumns: List[SnowflakeColumnWithLineage]
-
- @classmethod
- def from_dict(
- cls,
- dataset: str,
- upstreams_columns_json: Optional[str],
- downstream_columns_json: Optional[str],
- ) -> "SnowflakeUpstreamTable":
- try:
- upstreams_columns_list = []
- downstream_columns_list = []
- if upstreams_columns_json is not None:
- upstreams_columns_list = json.loads(upstreams_columns_json)
- if downstream_columns_json is not None:
- downstream_columns_list = json.loads(downstream_columns_json)
-
- table_with_upstreams = cls(
- dataset,
- [
- SnowflakeColumnReference.parse_obj(col)
- for col in upstreams_columns_list
- ],
- [
- SnowflakeColumnWithLineage.parse_obj(col)
- for col in downstream_columns_list
- ],
- )
- except ValidationError:
- # Earlier versions of column lineage did not include columnName, only columnId
- table_with_upstreams = cls(dataset, [], [])
- return table_with_upstreams
-
-
-@dataclass
-class SnowflakeTableLineage:
- # key: upstream table name
- upstreamTables: Dict[str, SnowflakeUpstreamTable] = field(
- default_factory=dict, init=False
- )
-
- # key: downstream column name
- columnLineages: Dict[str, SnowflakeColumnUpstreams] = field(
- default_factory=lambda: defaultdict(SnowflakeColumnUpstreams), init=False
- )
-
- def update_lineage(
- self, table: SnowflakeUpstreamTable, include_column_lineage: bool = True
- ) -> None:
- if table.upstreamDataset not in self.upstreamTables.keys():
- self.upstreamTables[table.upstreamDataset] = table
-
- if include_column_lineage and table.downstreamColumns:
- for col in table.downstreamColumns:
- if col.directSourceColumns:
- self.columnLineages[col.columnName].update_column_lineage(
- col.directSourceColumns
- )
-
-
-class SnowflakeLineageExtractor(
- SnowflakeQueryMixin, SnowflakeConnectionMixin, SnowflakeCommonMixin
-):
- """
- Extracts Lineage from Snowflake.
- Following lineage edges are considered.
-
- 1. "Table to View" lineage via `snowflake.account_usage.object_dependencies` view
- 2. "S3 to Table" lineage via `show external tables` query.
- 3. "View to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above)
- 4. "Table to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above)
- 5. "S3 to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above)
-
- Edition Note - Snowflake Standard Edition does not have Access History Feature. So it does not support lineage extraction for edges 3, 4, 5 mentioned above.
- """
-
- def __init__(
- self,
- config: SnowflakeV2Config,
- report: SnowflakeV2Report,
- dataset_urn_builder: Callable[[str], str],
- ) -> None:
- self._lineage_map: Dict[str, SnowflakeTableLineage] = defaultdict(
- SnowflakeTableLineage
- )
- self._external_lineage_map: Dict[str, Set[str]] = defaultdict(set)
- self.config = config
- self.report = report
- self.logger = logger
- self.dataset_urn_builder = dataset_urn_builder
- self.connection: Optional[SnowflakeConnection] = None
-
- # Kwargs used by new snowflake lineage extractor need to be ignored here
- def get_workunits(
- self, discovered_tables: List[str], discovered_views: List[str], **_kwargs: Any
- ) -> Iterable[MetadataWorkUnit]:
- self.connection = self.create_connection()
- if self.connection is None:
- return
-
- self._populate_table_lineage()
-
- if self.config.include_view_lineage:
- if len(discovered_views) > 0:
- self._populate_view_lineage()
- else:
- logger.info("No views found. Skipping View Lineage Extraction.")
-
- self._populate_external_lineage()
-
- if (
- len(self._lineage_map.keys()) == 0
- and len(self._external_lineage_map.keys()) == 0
- ):
- logger.debug("No lineage found.")
- return
-
- yield from self.get_table_upstream_workunits(discovered_tables)
- yield from self.get_view_upstream_workunits(discovered_views)
-
- def _populate_table_lineage(self):
- if self.report.edition == SnowflakeEdition.STANDARD:
- logger.info(
- "Snowflake Account is Standard Edition. Table to Table Lineage Feature is not supported."
- ) # See Edition Note above for why
- else:
- with PerfTimer() as timer:
- self._populate_lineage()
- self.report.table_lineage_query_secs = timer.elapsed_seconds()
-
- def get_table_upstream_workunits(self, discovered_tables):
- if self.config.include_table_lineage:
- for dataset_name in discovered_tables:
- upstream_lineage = self._get_upstream_lineage_info(dataset_name)
- if upstream_lineage is not None:
- yield MetadataChangeProposalWrapper(
- entityUrn=self.dataset_urn_builder(dataset_name),
- aspect=upstream_lineage,
- ).as_workunit()
-
- def get_view_upstream_workunits(self, discovered_views):
- if self.config.include_view_lineage:
- for view_name in discovered_views:
- upstream_lineage = self._get_upstream_lineage_info(view_name)
- if upstream_lineage is not None:
- yield MetadataChangeProposalWrapper(
- entityUrn=self.dataset_urn_builder(view_name),
- aspect=upstream_lineage,
- ).as_workunit()
-
- def _get_upstream_lineage_info(
- self, dataset_name: str
- ) -> Optional[UpstreamLineage]:
- lineage = self._lineage_map[dataset_name]
- external_lineage = self._external_lineage_map[dataset_name]
- if not (lineage.upstreamTables or lineage.columnLineages or external_lineage):
- logger.debug(f"No lineage found for {dataset_name}")
- return None
-
- upstream_tables: List[UpstreamClass] = []
- finegrained_lineages: List[FineGrainedLineage] = []
-
- # Populate the table-lineage in aspect
- self.update_upstream_tables_lineage(upstream_tables, lineage)
-
- # Populate the column-lineage in aspect
- self.update_upstream_columns_lineage(
- self.dataset_urn_builder(dataset_name), finegrained_lineages, lineage
- )
-
- # Populate the external-table-lineage(s3->snowflake) in aspect
- self.update_external_tables_lineage(upstream_tables, external_lineage)
-
- if len(upstream_tables) > 0:
- logger.debug(
- f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}"
- )
- if self.config.upstream_lineage_in_report:
- self.report.upstream_lineage[dataset_name] = [
- u.dataset for u in upstream_tables
- ]
- return UpstreamLineage(
- upstreams=upstream_tables,
- fineGrainedLineages=sorted(
- finegrained_lineages, key=lambda x: (x.downstreams, x.upstreams)
- )
- or None,
- )
- else:
- return None
-
- def _populate_view_lineage(self) -> None:
- with PerfTimer() as timer:
- self._populate_view_upstream_lineage()
- self.report.view_upstream_lineage_query_secs = timer.elapsed_seconds()
-
- if self.report.edition == SnowflakeEdition.STANDARD:
- logger.info(
- "Snowflake Account is Standard Edition. View to Table Lineage Feature is not supported."
- ) # See Edition Note above for why
- else:
- with PerfTimer() as timer:
- self._populate_view_downstream_lineage()
- self.report.view_downstream_lineage_query_secs = timer.elapsed_seconds()
-
- def _populate_external_lineage(self) -> None:
- with PerfTimer() as timer:
- self.report.num_external_table_edges_scanned = 0
-
- if self.report.edition == SnowflakeEdition.STANDARD:
- logger.info(
- "Snowflake Account is Standard Edition. External Lineage Feature via Access History is not supported."
- ) # See Edition Note above for why
- else:
- self._populate_external_lineage_from_access_history()
-
- self._populate_external_lineage_from_show_query()
-
- logger.info(
- f"Found {self.report.num_external_table_edges_scanned} external lineage edges."
- )
-
- self.report.external_lineage_queries_secs = timer.elapsed_seconds()
-
- # Handles the case for explicitly created external tables.
- # NOTE: Snowflake does not log this information to the access_history table.
- def _populate_external_lineage_from_show_query(self):
- external_tables_query: str = SnowflakeQuery.show_external_tables()
- try:
- for db_row in self.query(external_tables_query):
- key = self.get_dataset_identifier(
- db_row["name"], db_row["schema_name"], db_row["database_name"]
- )
-
- if not self._is_dataset_pattern_allowed(
- key, SnowflakeObjectDomain.TABLE
- ):
- continue
- self._external_lineage_map[key].add(db_row["location"])
- logger.debug(
- f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via show external tables"
- )
- self.report.num_external_table_edges_scanned += 1
- except Exception as e:
- logger.debug(e, exc_info=e)
- self.report_warning(
- "external_lineage",
- f"Populating external table lineage from Snowflake failed due to error {e}.",
- )
-
- # Handles the case where a table is populated from an external location via copy.
- # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv';
- def _populate_external_lineage_from_access_history(self):
- query: str = SnowflakeQuery.external_table_lineage_history(
- start_time_millis=int(self.config.start_time.timestamp() * 1000)
- if not self.config.ignore_start_time_lineage
- else 0,
- end_time_millis=int(self.config.end_time.timestamp() * 1000),
- )
-
- try:
- for db_row in self.query(query):
- self._process_external_lineage_result_row(db_row)
- except Exception as e:
- if isinstance(e, SnowflakePermissionError):
- error_msg = "Failed to get external lineage. Please grant imported privileges on SNOWFLAKE database. "
- self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg)
- else:
- logger.debug(e, exc_info=e)
- self.report_warning(
- "external_lineage",
- f"Populating table external lineage from Snowflake failed due to error {e}.",
- )
-
- def _process_external_lineage_result_row(self, db_row):
- # key is the down-stream table name
- key: str = self.get_dataset_identifier_from_qualified_name(
- db_row["DOWNSTREAM_TABLE_NAME"]
- )
- if not self._is_dataset_pattern_allowed(key, SnowflakeObjectDomain.TABLE):
- return
-
- if db_row["UPSTREAM_LOCATIONS"] is not None:
- external_locations = json.loads(db_row["UPSTREAM_LOCATIONS"])
-
- for loc in external_locations:
- if loc not in self._external_lineage_map[key]:
- self._external_lineage_map[key].add(loc)
- self.report.num_external_table_edges_scanned += 1
-
- logger.debug(
- f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via access_history"
- )
-
- def _populate_lineage(self) -> None:
- query: str = SnowflakeQuery.table_to_table_lineage_history(
- start_time_millis=int(self.config.start_time.timestamp() * 1000)
- if not self.config.ignore_start_time_lineage
- else 0,
- end_time_millis=int(self.config.end_time.timestamp() * 1000),
- include_column_lineage=self.config.include_column_lineage,
- )
- self.report.num_table_to_table_edges_scanned = 0
- try:
- for db_row in self.query(query):
- self._process_table_lineage_row(db_row)
- except Exception as e:
- if isinstance(e, SnowflakePermissionError):
- error_msg = "Failed to get table to table lineage. Please grant imported privileges on SNOWFLAKE database. "
- self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg)
- else:
- logger.debug(e, exc_info=e)
- self.report_warning(
- "table-lineage",
- f"Extracting lineage from Snowflake failed due to error {e}.",
- )
- logger.info(
- f"A total of {self.report.num_table_to_table_edges_scanned} Table->Table edges found"
- f" for {len(self._lineage_map)} downstream tables.",
- )
-
- def _process_table_lineage_row(self, db_row):
- # key is the down-stream table name
- key: str = self.get_dataset_identifier_from_qualified_name(
- db_row["DOWNSTREAM_TABLE_NAME"]
- )
- upstream_table_name = self.get_dataset_identifier_from_qualified_name(
- db_row["UPSTREAM_TABLE_NAME"]
- )
- if not self._is_dataset_pattern_allowed(
- key, SnowflakeObjectDomain.TABLE
- ) or not (
- self._is_dataset_pattern_allowed(
- upstream_table_name, SnowflakeObjectDomain.TABLE, is_upstream=True
- )
- ):
- return
- self._lineage_map[key].update_lineage(
- # (, , )
- SnowflakeUpstreamTable.from_dict(
- upstream_table_name,
- db_row["UPSTREAM_TABLE_COLUMNS"],
- db_row["DOWNSTREAM_TABLE_COLUMNS"],
- ),
- self.config.include_column_lineage,
- )
- self.report.num_table_to_table_edges_scanned += 1
- logger.debug(f"Lineage[Table(Down)={key}]:Table(Up)={self._lineage_map[key]}")
-
- def _populate_view_upstream_lineage(self) -> None:
- # NOTE: This query captures only the upstream lineage of a view (with no column lineage).
- # For more details see: https://docs.snowflake.com/en/user-guide/object-dependencies.html#object-dependencies
- # and also https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views.
- view_upstream_lineage_query: str = SnowflakeQuery.view_dependencies()
-
- self.report.num_table_to_view_edges_scanned = 0
-
- try:
- for db_row in self.query(view_upstream_lineage_query):
- self._process_view_upstream_lineage_row(db_row)
- except Exception as e:
- if isinstance(e, SnowflakePermissionError):
- error_msg = "Failed to get table to view lineage. Please grant imported privileges on SNOWFLAKE database."
- self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg)
- else:
- logger.debug(e, exc_info=e)
- self.report_warning(
- "view-upstream-lineage",
- f"Extracting the upstream view lineage from Snowflake failed due to error {e}.",
- )
- logger.info(
- f"A total of {self.report.num_table_to_view_edges_scanned} View upstream edges found."
- )
-
- def _process_view_upstream_lineage_row(self, db_row):
- # Process UpstreamTable/View/ExternalTable/Materialized View->View edge.
- view_upstream: str = self.get_dataset_identifier_from_qualified_name(
- db_row["VIEW_UPSTREAM"]
- )
- view_name: str = self.get_dataset_identifier_from_qualified_name(
- db_row["DOWNSTREAM_VIEW"]
- )
-
- if not self._is_dataset_pattern_allowed(
- dataset_name=view_name,
- dataset_type=db_row["REFERENCING_OBJECT_DOMAIN"],
- ) or not self._is_dataset_pattern_allowed(
- view_upstream, db_row["REFERENCED_OBJECT_DOMAIN"], is_upstream=True
- ):
- return
- # key is the downstream view name
- self._lineage_map[view_name].update_lineage(
- # (, , )
- SnowflakeUpstreamTable.from_dict(view_upstream, None, None),
- self.config.include_column_lineage,
- )
- self.report.num_table_to_view_edges_scanned += 1
- logger.debug(
- f"Upstream->View: Lineage[View(Down)={view_name}]:Upstream={view_upstream}"
- )
-
- def _populate_view_downstream_lineage(self) -> None:
- # This query captures the downstream table lineage for views.
- # See https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views.
- # Eg: For viewA->viewB->ViewC->TableD, snowflake does not yet log intermediate view logs, resulting in only the viewA->TableD edge.
- view_lineage_query: str = SnowflakeQuery.view_lineage_history(
- start_time_millis=int(self.config.start_time.timestamp() * 1000)
- if not self.config.ignore_start_time_lineage
- else 0,
- end_time_millis=int(self.config.end_time.timestamp() * 1000),
- include_column_lineage=self.config.include_column_lineage,
- )
-
- self.report.num_view_to_table_edges_scanned = 0
-
- try:
- for db_row in self.query(view_lineage_query):
- self._process_view_downstream_lineage_row(db_row)
- except Exception as e:
- if isinstance(e, SnowflakePermissionError):
- error_msg = "Failed to get view to table lineage. Please grant imported privileges on SNOWFLAKE database. "
- self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg)
- else:
- logger.debug(e, exc_info=e)
- self.report_warning(
- "view-downstream-lineage",
- f"Extracting the view lineage from Snowflake failed due to error {e}.",
- )
-
- logger.info(
- f"Found {self.report.num_view_to_table_edges_scanned} View->Table edges."
- )
-
- def _process_view_downstream_lineage_row(self, db_row):
- view_name: str = self.get_dataset_identifier_from_qualified_name(
- db_row["VIEW_NAME"]
- )
- downstream_table: str = self.get_dataset_identifier_from_qualified_name(
- db_row["DOWNSTREAM_TABLE_NAME"]
- )
- if not self._is_dataset_pattern_allowed(
- view_name, db_row["VIEW_DOMAIN"], is_upstream=True
- ) or not self._is_dataset_pattern_allowed(
- downstream_table, db_row["DOWNSTREAM_TABLE_DOMAIN"]
- ):
- return
-
- # Capture view->downstream table lineage.
- self._lineage_map[downstream_table].update_lineage(
- # (, , )
- SnowflakeUpstreamTable.from_dict(
- view_name,
- db_row["VIEW_COLUMNS"],
- db_row["DOWNSTREAM_TABLE_COLUMNS"],
- ),
- self.config.include_column_lineage,
- )
- self.report.num_view_to_table_edges_scanned += 1
-
- logger.debug(
- f"View->Table: Lineage[Table(Down)={downstream_table}]:View(Up)={self._lineage_map[downstream_table]}"
- )
-
- def update_upstream_tables_lineage(
- self, upstream_tables: List[UpstreamClass], lineage: SnowflakeTableLineage
- ) -> None:
- for lineage_entry in sorted(
- lineage.upstreamTables.values(), key=lambda x: x.upstreamDataset
- ):
- upstream_table_name = lineage_entry.upstreamDataset
- upstream_table = UpstreamClass(
- dataset=self.dataset_urn_builder(upstream_table_name),
- type=DatasetLineageTypeClass.TRANSFORMED,
- )
- upstream_tables.append(upstream_table)
-
- def update_upstream_columns_lineage(
- self,
- dataset_urn: str,
- finegrained_lineages: List[FineGrainedLineage],
- lineage: SnowflakeTableLineage,
- ) -> None:
- # For every column for which upstream lineage is available
- for col, col_upstreams in lineage.columnLineages.items():
- # For every upstream of column
- self.update_upstream_columns_lineage_of_column(
- dataset_urn, col, finegrained_lineages, col_upstreams
- )
-
- def update_upstream_columns_lineage_of_column(
- self,
- dataset_urn: str,
- col: str,
- finegrained_lineages: List[FineGrainedLineage],
- col_upstreams: SnowflakeColumnUpstreams,
- ) -> None:
- for fine_upstream in col_upstreams.upstreams:
- finegrained_lineage_entry = self.build_finegrained_lineage(
- dataset_urn, col, fine_upstream
- )
- if finegrained_lineage_entry.upstreams:
- finegrained_lineages.append(finegrained_lineage_entry)
-
- def build_finegrained_lineage(
- self,
- dataset_urn: str,
- col: str,
- fine_upstream: SnowflakeColumnFineGrainedLineage,
- ) -> FineGrainedLineage:
- fieldPath = col
-
- column_upstreams = self.build_finegrained_lineage_upstreams(fine_upstream)
- finegrained_lineage_entry = FineGrainedLineage(
- upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
- # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend
- # even if the lineage is same but the order is different.
- upstreams=sorted(column_upstreams),
- downstreamType=FineGrainedLineageDownstreamType.FIELD,
- downstreams=[
- builder.make_schema_field_urn(
- dataset_urn, self.snowflake_identifier(fieldPath)
- )
- ],
- )
-
- return finegrained_lineage_entry
-
- def build_finegrained_lineage_upstreams(
- self, fine_upstream: SnowflakeColumnFineGrainedLineage
- ) -> List[str]:
- column_upstreams = []
- for upstream_col in fine_upstream.inputColumns:
- if (
- upstream_col.objectName
- and upstream_col.columnName
- and self._is_dataset_pattern_allowed(
- upstream_col.objectName, upstream_col.objectDomain, is_upstream=True
- )
- ):
- upstream_dataset_name = self.get_dataset_identifier_from_qualified_name(
- upstream_col.objectName
- )
- column_upstreams.append(
- builder.make_schema_field_urn(
- self.dataset_urn_builder(upstream_dataset_name),
- self.snowflake_identifier(upstream_col.columnName),
- )
- )
- return column_upstreams
-
- def update_external_tables_lineage(
- self, upstream_tables: List[UpstreamClass], external_lineage: Set[str]
- ) -> None:
- for external_lineage_entry in sorted(external_lineage):
- # For now, populate only for S3
- if external_lineage_entry.startswith("s3://"):
- external_upstream_table = UpstreamClass(
- dataset=make_s3_urn(external_lineage_entry, self.config.env),
- type=DatasetLineageTypeClass.COPY,
- )
- upstream_tables.append(external_upstream_table)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
index 587c71a98be67f..0f89324f5efc60 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
@@ -506,35 +506,6 @@ def view_dependencies_v2() -> str:
def show_external_tables() -> str:
return "show external tables in account"
- # Note - This method should be removed once legacy lineage is removed
- @staticmethod
- def external_table_lineage_history(
- start_time_millis: int, end_time_millis: int
- ) -> str:
- return f"""
- WITH external_table_lineage_history AS (
- SELECT
- r.value:"locations" AS upstream_locations,
- w.value:"objectName"::varchar AS downstream_table_name,
- w.value:"objectDomain"::varchar AS downstream_table_domain,
- w.value:"columns" AS downstream_table_columns,
- t.query_start_time AS query_start_time
- FROM
- (SELECT * from snowflake.account_usage.access_history) t,
- lateral flatten(input => t.BASE_OBJECTS_ACCESSED) r,
- lateral flatten(input => t.OBJECTS_MODIFIED) w
- WHERE r.value:"locations" IS NOT NULL
- AND w.value:"objectId" IS NOT NULL
- AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
- AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3))
- SELECT
- upstream_locations AS "UPSTREAM_LOCATIONS",
- downstream_table_name AS "DOWNSTREAM_TABLE_NAME",
- downstream_table_columns AS "DOWNSTREAM_TABLE_COLUMNS"
- FROM external_table_lineage_history
- WHERE downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}'
- QUALIFY ROW_NUMBER() OVER (PARTITION BY downstream_table_name ORDER BY query_start_time DESC) = 1"""
-
@staticmethod
def copy_lineage_history(
start_time_millis: int,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py
index dab46645bffcc7..e5b214ba35e4b6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -261,6 +261,7 @@ def get_tables_for_database(
for table in cur:
if table["TABLE_SCHEMA"] not in tables:
tables[table["TABLE_SCHEMA"]] = []
+
tables[table["TABLE_SCHEMA"]].append(
SnowflakeTable(
name=table["TABLE_NAME"],
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py
new file mode 100644
index 00000000000000..6f7520bbf1988a
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py
@@ -0,0 +1,158 @@
+import logging
+from typing import Callable, Iterable, List
+
+from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.snowflake.snowflake_config import (
+ DatabaseId,
+ SnowflakeV2Config,
+)
+from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
+from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDatabase
+from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
+from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
+from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
+ DatasetLineageType,
+ Upstream,
+ UpstreamLineage,
+)
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class SnowflakeSharesHandler(SnowflakeCommonMixin):
+ def __init__(
+ self,
+ config: SnowflakeV2Config,
+ report: SnowflakeV2Report,
+ dataset_urn_builder: Callable[[str], str],
+ ) -> None:
+ self.config = config
+ self.report = report
+ self.logger = logger
+ self.dataset_urn_builder = dataset_urn_builder
+
+ def get_shares_workunits(
+ self, databases: List[SnowflakeDatabase]
+ ) -> Iterable[MetadataWorkUnit]:
+ inbounds = self.config.inbounds()
+ outbounds = self.config.outbounds()
+ # None of the databases are shared
+ if not (inbounds or outbounds):
+ return
+
+ logger.debug("Checking databases for inbound or outbound shares.")
+ for db in databases:
+ is_inbound = db.name in inbounds
+ is_outbound = db.name in outbounds
+
+ if not (is_inbound or is_outbound):
+ logger.debug(f"database {db.name} is not shared.")
+ continue
+
+ sibling_dbs = (
+ list(outbounds[db.name]) if is_outbound else [inbounds[db.name]]
+ )
+
+ for schema in db.schemas:
+ for table_name in schema.tables + schema.views:
+ # TODO: If this is outbound database,
+ # 1. attempt listing shares using `show shares` to identify name of share associated with this database (cache query result).
+ # 2. if the corresponding share is listed, then run `show grants to share <share_name>` to identify the exact tables and views included in the share.
+ # 3. emit siblings only for the objects listed above.
+ # This will work only if the configured role has the accountadmin role OR is the owner of the share.
+ # Otherwise, ghost nodes may be shown in the "Composed Of" section for tables/views in the original database that are not granted to the share.
+ yield from self.gen_siblings(
+ db.name,
+ schema.name,
+ table_name,
+ is_outbound,
+ sibling_dbs,
+ )
+
+ if is_inbound:
+ assert len(sibling_dbs) == 1
+ # SnowflakeLineageExtractor is unaware of the database->schema->table hierarchy,
+ # hence this lineage code is not written in SnowflakeLineageExtractor.
+ # Also, this is not governed by the include_table_lineage and include_view_lineage configs.
+ yield self.get_upstream_lineage_with_primary_sibling(
+ db.name, schema.name, table_name, sibling_dbs[0]
+ )
+
+ self.report_missing_databases(
+ databases, list(inbounds.keys()), list(outbounds.keys())
+ )
+
+ def report_missing_databases(
+ self,
+ databases: List[SnowflakeDatabase],
+ inbounds: List[str],
+ outbounds: List[str],
+ ) -> None:
+ db_names = [db.name for db in databases]
+ missing_dbs = [db for db in inbounds + outbounds if db not in db_names]
+
+ if missing_dbs:
+ self.report_warning(
+ "snowflake-shares",
+ f"Databases {missing_dbs} were not ingested. Siblings/Lineage will not be set for these.",
+ )
+
+ def gen_siblings(
+ self,
+ database_name: str,
+ schema_name: str,
+ table_name: str,
+ primary: bool,
+ sibling_databases: List[DatabaseId],
+ ) -> Iterable[MetadataWorkUnit]:
+ if not sibling_databases:
+ return
+ dataset_identifier = self.get_dataset_identifier(
+ table_name, schema_name, database_name
+ )
+ urn = self.dataset_urn_builder(dataset_identifier)
+
+ sibling_urns = [
+ make_dataset_urn_with_platform_instance(
+ self.platform,
+ self.get_dataset_identifier(
+ table_name, schema_name, sibling_db.database
+ ),
+ sibling_db.platform_instance,
+ )
+ for sibling_db in sibling_databases
+ ]
+
+ yield MetadataChangeProposalWrapper(
+ entityUrn=urn,
+ aspect=Siblings(primary=primary, siblings=sorted(sibling_urns)),
+ ).as_workunit()
+
+ def get_upstream_lineage_with_primary_sibling(
+ self,
+ database_name: str,
+ schema_name: str,
+ table_name: str,
+ primary_sibling_db: DatabaseId,
+ ) -> MetadataWorkUnit:
+ dataset_identifier = self.get_dataset_identifier(
+ table_name, schema_name, database_name
+ )
+ urn = self.dataset_urn_builder(dataset_identifier)
+
+ upstream_urn = make_dataset_urn_with_platform_instance(
+ self.platform,
+ self.get_dataset_identifier(
+ table_name, schema_name, primary_sibling_db.database
+ ),
+ primary_sibling_db.platform_instance,
+ )
+
+ return MetadataChangeProposalWrapper(
+ entityUrn=urn,
+ aspect=UpstreamLineage(
+ upstreams=[Upstream(dataset=upstream_urn, type=DatasetLineageType.COPY)]
+ ),
+ ).as_workunit()
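
Putting the handler above together: for an inbound database, each consumer-side table gets a non-primary Siblings aspect plus a COPY-type upstream pointing at the producer-side dataset, both built with `make_dataset_urn_with_platform_instance`. A hedged sketch of the producer-side URN construction, with invented identifiers:

from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

# Consumer account ingests db1_from_share.public.orders; its sibling/upstream lives
# in the producer account's db1 under platform_instance "instance1".
producer_urn = make_dataset_urn_with_platform_instance(
    platform="snowflake",
    name="db1.public.orders",
    platform_instance="instance1",
)
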
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
index 7dd51d5b20e8e7..2cb4b37fdd696e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -51,9 +51,6 @@
SnowflakeV2Config,
TagOption,
)
-from datahub.ingestion.source.snowflake.snowflake_lineage_legacy import (
- SnowflakeLineageExtractor as SnowflakeLineageLegacyExtractor,
-)
from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import (
SnowflakeLineageExtractor,
)
@@ -71,6 +68,7 @@
SnowflakeTag,
SnowflakeView,
)
+from datahub.ingestion.source.snowflake.snowflake_shares import SnowflakeSharesHandler
from datahub.ingestion.source.snowflake.snowflake_tag import SnowflakeTagExtractor
from datahub.ingestion.source.snowflake.snowflake_usage_v2 import (
SnowflakeUsageExtractor,
@@ -240,19 +238,10 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config):
# For database, schema, tables, views, etc
self.data_dictionary = SnowflakeDataDictionary()
- self.lineage_extractor: Union[
- SnowflakeLineageExtractor, SnowflakeLineageLegacyExtractor
- ]
if config.include_table_lineage:
- # For lineage
- if self.config.use_legacy_lineage_method:
- self.lineage_extractor = SnowflakeLineageLegacyExtractor(
- config, self.report, dataset_urn_builder=self.gen_dataset_urn
- )
- else:
- self.lineage_extractor = SnowflakeLineageExtractor(
- config, self.report, dataset_urn_builder=self.gen_dataset_urn
- )
+ self.lineage_extractor = SnowflakeLineageExtractor(
+ config, self.report, dataset_urn_builder=self.gen_dataset_urn
+ )
if config.include_usage_stats or config.include_operational_stats:
self.usage_extractor = SnowflakeUsageExtractor(
@@ -503,9 +492,16 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
return
self.data_dictionary.set_connection(self.connection)
- databases = self.get_databases()
+ databases: List[SnowflakeDatabase] = []
- if databases is None or len(databases) == 0:
+ for database in self.get_databases() or []:
+ self.report.report_entity_scanned(database.name, "database")
+ if not self.config.database_pattern.allowed(database.name):
+ self.report.report_dropped(f"{database.name}.*")
+ else:
+ databases.append(database)
+
+ if len(databases) == 0:
return
for snowflake_db in databases:
@@ -532,25 +528,22 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
# TODO: The checkpoint state for stale entity detection can be committed here.
+ if self.config.shares:
+ yield from SnowflakeSharesHandler(
+ self.config, self.report, self.gen_dataset_urn
+ ).get_shares_workunits(databases)
+
discovered_tables: List[str] = [
self.get_dataset_identifier(table_name, schema.name, db.name)
for db in databases
for schema in db.schemas
for table_name in schema.tables
- if self._is_dataset_pattern_allowed(
- self.get_dataset_identifier(table_name, schema.name, db.name),
- SnowflakeObjectDomain.TABLE,
- )
]
discovered_views: List[str] = [
self.get_dataset_identifier(table_name, schema.name, db.name)
for db in databases
for schema in db.schemas
for table_name in schema.views
- if self._is_dataset_pattern_allowed(
- self.get_dataset_identifier(table_name, schema.name, db.name),
- SnowflakeObjectDomain.VIEW,
- )
]
if len(discovered_tables) == 0 and len(discovered_views) == 0:
@@ -654,11 +647,6 @@ def get_databases_from_ischema(self, databases):
def _process_database(
self, snowflake_db: SnowflakeDatabase
) -> Iterable[MetadataWorkUnit]:
- self.report.report_entity_scanned(snowflake_db.name, "database")
- if not self.config.database_pattern.allowed(snowflake_db.name):
- self.report.report_dropped(f"{snowflake_db.name}.*")
- return
-
db_name = snowflake_db.name
try:
@@ -704,11 +692,22 @@ def _process_database(
if self.config.is_profiling_enabled() and self.db_tables:
yield from self.profiler.get_workunits(snowflake_db, self.db_tables)
- def fetch_schemas_for_database(self, snowflake_db, db_name):
+ def fetch_schemas_for_database(
+ self, snowflake_db: SnowflakeDatabase, db_name: str
+ ) -> None:
+ schemas: List[SnowflakeSchema] = []
try:
- snowflake_db.schemas = self.data_dictionary.get_schemas_for_database(
- db_name
- )
+ for schema in self.data_dictionary.get_schemas_for_database(db_name):
+ self.report.report_entity_scanned(schema.name, "schema")
+ if not is_schema_allowed(
+ self.config.schema_pattern,
+ schema.name,
+ db_name,
+ self.config.match_fully_qualified_names,
+ ):
+ self.report.report_dropped(f"{db_name}.{schema.name}.*")
+ else:
+ schemas.append(schema)
except Exception as e:
if isinstance(e, SnowflakePermissionError):
error_msg = f"Failed to get schemas for database {db_name}. Please check permissions."
@@ -724,25 +723,17 @@ def fetch_schemas_for_database(self, snowflake_db, db_name):
db_name,
)
- if not snowflake_db.schemas:
+ if not schemas:
self.report_warning(
"No schemas found in database. If schemas exist, please grant USAGE permissions on them.",
db_name,
)
+ else:
+ snowflake_db.schemas = schemas
def _process_schema(
self, snowflake_schema: SnowflakeSchema, db_name: str
) -> Iterable[MetadataWorkUnit]:
- self.report.report_entity_scanned(snowflake_schema.name, "schema")
- if not is_schema_allowed(
- self.config.schema_pattern,
- snowflake_schema.name,
- db_name,
- self.config.match_fully_qualified_names,
- ):
- self.report.report_dropped(f"{db_name}.{snowflake_schema.name}.*")
- return
-
schema_name = snowflake_schema.name
if self.config.extract_tags != TagOption.skip:
@@ -784,9 +775,20 @@ def _process_schema(
f"{db_name}.{schema_name}",
)
- def fetch_views_for_schema(self, snowflake_schema, db_name, schema_name):
+ def fetch_views_for_schema(
+ self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
+ ) -> List[SnowflakeView]:
try:
- views = self.get_views_for_schema(schema_name, db_name)
+ views: List[SnowflakeView] = []
+ for view in self.get_views_for_schema(schema_name, db_name):
+ view_name = self.get_dataset_identifier(view.name, schema_name, db_name)
+
+ self.report.report_entity_scanned(view_name, "view")
+
+ if not self.config.view_pattern.allowed(view_name):
+ self.report.report_dropped(view_name)
+ else:
+ views.append(view)
snowflake_schema.views = [view.name for view in views]
return views
except Exception as e:
@@ -804,10 +806,22 @@ def fetch_views_for_schema(self, snowflake_schema, db_name, schema_name):
"Failed to get views for schema",
f"{db_name}.{schema_name}",
)
+ return []
- def fetch_tables_for_schema(self, snowflake_schema, db_name, schema_name):
+ def fetch_tables_for_schema(
+ self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
+ ) -> List[SnowflakeTable]:
try:
- tables = self.get_tables_for_schema(schema_name, db_name)
+ tables: List[SnowflakeTable] = []
+ for table in self.get_tables_for_schema(schema_name, db_name):
+ table_identifier = self.get_dataset_identifier(
+ table.name, schema_name, db_name
+ )
+ self.report.report_entity_scanned(table_identifier)
+ if not self.config.table_pattern.allowed(table_identifier):
+ self.report.report_dropped(table_identifier)
+ else:
+ tables.append(table)
snowflake_schema.tables = [table.name for table in tables]
return tables
except Exception as e:
@@ -824,6 +838,7 @@ def fetch_tables_for_schema(self, snowflake_schema, db_name, schema_name):
"Failed to get tables for schema",
f"{db_name}.{schema_name}",
)
+ return []
def _process_table(
self,
@@ -833,12 +848,6 @@ def _process_table(
) -> Iterable[MetadataWorkUnit]:
table_identifier = self.get_dataset_identifier(table.name, schema_name, db_name)
- self.report.report_entity_scanned(table_identifier)
-
- if not self.config.table_pattern.allowed(table_identifier):
- self.report.report_dropped(table_identifier)
- return
-
self.fetch_columns_for_table(table, schema_name, db_name, table_identifier)
self.fetch_pk_for_table(table, schema_name, db_name, table_identifier)
@@ -950,12 +959,6 @@ def _process_view(
) -> Iterable[MetadataWorkUnit]:
view_name = self.get_dataset_identifier(view.name, schema_name, db_name)
- self.report.report_entity_scanned(view_name, "view")
-
- if not self.config.view_pattern.allowed(view_name):
- self.report.report_dropped(view_name)
- return
-
try:
view.columns = self.get_columns_for_table(view.name, schema_name, db_name)
if self.config.extract_tags != TagOption.skip:
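
The last few hunks move the database/schema/table/view pattern checks from the per-entity processing methods up to the fetch helpers, so filtered objects never enter `discovered_tables`/`discovered_views`. The filtering itself still relies on the existing allow/deny patterns; a small sketch of that behaviour, with an illustrative pattern and names:

from datahub.configuration.common import AllowDenyPattern

table_pattern = AllowDenyPattern(deny=[r".*\.staging\..*"])
print(table_pattern.allowed("sales_db.staging.tmp_orders"))  # False -> reported as dropped
print(table_pattern.allowed("sales_db.public.orders"))       # True  -> kept and processed
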
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py
deleted file mode 100644
index a9afd40fd45b68..00000000000000
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py
+++ /dev/null
@@ -1,278 +0,0 @@
-import logging
-import urllib.parse
-from typing import Any, Dict, Iterable, List, Optional, Tuple
-
-import pydantic
-import sqlalchemy.dialects.mssql
-
-# This import verifies that the dependencies are available.
-import sqlalchemy_pytds # noqa: F401
-from pydantic.fields import Field
-from sqlalchemy import create_engine, inspect
-from sqlalchemy.engine.base import Connection
-from sqlalchemy.engine.reflection import Inspector
-
-from datahub.configuration.common import AllowDenyPattern
-from datahub.ingestion.api.common import PipelineContext
-from datahub.ingestion.api.decorators import (
- SourceCapability,
- SupportStatus,
- capability,
- config_class,
- platform_name,
- support_status,
-)
-from datahub.ingestion.source.sql.sql_common import (
- SQLAlchemySource,
- register_custom_type,
-)
-from datahub.ingestion.source.sql.sql_config import (
- BasicSQLAlchemyConfig,
- make_sqlalchemy_uri,
-)
-from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass)
-register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass)
-
-
-class SQLServerConfig(BasicSQLAlchemyConfig):
- # defaults
- host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
- scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True)
- use_odbc: bool = Field(
- default=False,
- description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.",
- )
- uri_args: Dict[str, str] = Field(
- default={},
- description="Arguments to URL-encode when connecting. See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.",
- )
- database_pattern: AllowDenyPattern = Field(
- default=AllowDenyPattern.allow_all(),
- description="Regex patterns for databases to filter in ingestion.",
- )
- database: Optional[str] = Field(
- default=None,
- description="database (catalog). If set to Null, all databases will be considered for ingestion.",
- )
- convert_urns_to_lowercase: bool = Field(
- default=False,
- description="Enable to convert the SQL Server assets urns to lowercase",
- )
-
- @pydantic.validator("uri_args")
- def passwords_match(cls, v, values, **kwargs):
- if values["use_odbc"] and "driver" not in v:
- raise ValueError("uri_args must contain a 'driver' option")
- elif not values["use_odbc"] and v:
- raise ValueError("uri_args is not supported when ODBC is disabled")
- return v
-
- def get_sql_alchemy_url(
- self,
- uri_opts: Optional[Dict[str, Any]] = None,
- current_db: Optional[str] = None,
- ) -> str:
- if self.use_odbc:
- # Ensure that the import is available.
- import pyodbc # noqa: F401
-
- self.scheme = "mssql+pyodbc"
-
- uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri(
- self.scheme, # type: ignore
- self.username,
- self.password.get_secret_value() if self.password else None,
- self.host_port, # type: ignore
- current_db if current_db else self.database,
- uri_opts=uri_opts,
- )
- if self.use_odbc:
- uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}"
- return uri
-
-
-@platform_name("Microsoft SQL Server", id="mssql")
-@config_class(SQLServerConfig)
-@support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
-@capability(
- SourceCapability.USAGE_STATS,
- "Not provided by this module, use `bigquery-usage` for that.",
- supported=False,
-)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
-class SQLServerSource(SQLAlchemySource):
- """
- This plugin extracts the following:
-
- - Metadata for databases, schemas, views and tables
- - Column types associated with each table/view
- - Table, row, and column statistics via optional SQL profiling
-
- We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections.
- """
-
- def __init__(self, config: SQLServerConfig, ctx: PipelineContext):
- super().__init__(config, ctx, "mssql")
- # Cache the table and column descriptions
- self.config: SQLServerConfig = config
- self.current_database = None
- self.table_descriptions: Dict[str, str] = {}
- self.column_descriptions: Dict[str, str] = {}
- for inspector in self.get_inspectors():
- db_name: str = self.get_db_name(inspector)
- with inspector.engine.connect() as conn:
- if self.config.use_odbc:
- self._add_output_converters(conn)
- self._populate_table_descriptions(conn, db_name)
- self._populate_column_descriptions(conn, db_name)
-
- @staticmethod
- def _add_output_converters(conn: Connection) -> None:
- def handle_sql_variant_as_string(value):
- return value.decode("utf-16le")
-
- # see https://stackoverflow.com/questions/45677374/pandas-pyodbc-odbc-sql-type-150-is-not-yet-supported
- # and https://stackoverflow.com/questions/11671170/adding-output-converter-to-pyodbc-connection-in-sqlalchemy
- try:
- conn.connection.add_output_converter(-150, handle_sql_variant_as_string)
- except AttributeError as e:
- logger.debug(
- f"Failed to mount output converter for MSSQL data type -150 due to {e}"
- )
-
- def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None:
- # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name
- # also see https://www.mssqltips.com/sqlservertip/5384/working-with-sql-server-extended-properties/
- table_metadata = conn.execute(
- """
- SELECT
- SCHEMA_NAME(T.SCHEMA_ID) AS schema_name,
- T.NAME AS table_name,
- EP.VALUE AS table_description
- FROM sys.tables AS T
- INNER JOIN sys.extended_properties AS EP
- ON EP.MAJOR_ID = T.[OBJECT_ID]
- AND EP.MINOR_ID = 0
- AND EP.NAME = 'MS_Description'
- AND EP.CLASS = 1
- """
- )
- for row in table_metadata:
- self.table_descriptions[
- f"{db_name}.{row['schema_name']}.{row['table_name']}"
- ] = row["table_description"]
-
- def _populate_column_descriptions(self, conn: Connection, db_name: str) -> None:
- column_metadata = conn.execute(
- """
- SELECT
- SCHEMA_NAME(T.SCHEMA_ID) AS schema_name,
- T.NAME AS table_name,
- C.NAME AS column_name ,
- EP.VALUE AS column_description
- FROM sys.tables AS T
- INNER JOIN sys.all_columns AS C
- ON C.OBJECT_ID = T.[OBJECT_ID]
- INNER JOIN sys.extended_properties AS EP
- ON EP.MAJOR_ID = T.[OBJECT_ID]
- AND EP.MINOR_ID = C.COLUMN_ID
- AND EP.NAME = 'MS_Description'
- AND EP.CLASS = 1
- """
- )
- for row in column_metadata:
- self.column_descriptions[
- f"{db_name}.{row['schema_name']}.{row['table_name']}.{row['column_name']}"
- ] = row["column_description"]
-
- @classmethod
- def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource":
- config = SQLServerConfig.parse_obj(config_dict)
- return cls(config, ctx)
-
- # override to get table descriptions
- def get_table_properties(
- self, inspector: Inspector, schema: str, table: str
- ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]:
- description, properties, location_urn = super().get_table_properties(
- inspector, schema, table
- )
- # Update description if available.
- db_name: str = self.get_db_name(inspector)
- description = self.table_descriptions.get(
- f"{db_name}.{schema}.{table}", description
- )
- return description, properties, location_urn
-
- # override to get column descriptions
- def _get_columns(
- self, dataset_name: str, inspector: Inspector, schema: str, table: str
- ) -> List[Dict]:
- columns: List[Dict] = super()._get_columns(
- dataset_name, inspector, schema, table
- )
- # Update column description if available.
- db_name: str = self.get_db_name(inspector)
- for column in columns:
- description: Optional[str] = self.column_descriptions.get(
- f"{db_name}.{schema}.{table}.{column['name']}",
- )
- if description:
- column["comment"] = description
- return columns
-
- def get_inspectors(self) -> Iterable[Inspector]:
- # This method can be overridden in the case that you want to dynamically
- # run on multiple databases.
- url = self.config.get_sql_alchemy_url()
- logger.debug(f"sql_alchemy_url={url}")
- engine = create_engine(url, **self.config.options)
- with engine.connect() as conn:
- if self.config.database and self.config.database != "":
- inspector = inspect(conn)
- yield inspector
- else:
- databases = conn.execute(
- "SELECT name FROM master.sys.databases WHERE name NOT IN \
- ('master', 'model', 'msdb', 'tempdb', 'Resource', \
- 'distribution' , 'reportserver', 'reportservertempdb'); "
- )
- for db in databases:
- if self.config.database_pattern.allowed(db["name"]):
- url = self.config.get_sql_alchemy_url(current_db=db["name"])
- with create_engine(
- url, **self.config.options
- ).connect() as conn:
- inspector = inspect(conn)
- self.current_database = db["name"]
- yield inspector
-
- def get_identifier(
- self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
- ) -> str:
- regular = f"{schema}.{entity}"
-
- qualified_table_name = regular
-
- if self.config.database:
- if self.config.database_alias:
- qualified_table_name = f"{self.config.database_alias}.{regular}"
- else:
- qualified_table_name = f"{self.config.database}.{regular}"
-
- if self.current_database:
- qualified_table_name = f"{self.current_database}.{regular}"
-
- return (
- qualified_table_name.lower()
- if self.config.convert_urns_to_lowercase
- else qualified_table_name
- )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py
new file mode 100644
index 00000000000000..8db89505a9cf67
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py
@@ -0,0 +1 @@
+from datahub.ingestion.source.sql.mssql.source import SQLServerConfig, SQLServerSource
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py
new file mode 100644
index 00000000000000..8aeb5421891aac
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py
@@ -0,0 +1,239 @@
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Union
+
+from datahub.emitter.mce_builder import make_data_flow_urn, make_data_job_urn
+from datahub.metadata.schema_classes import (
+ DataFlowInfoClass,
+ DataJobInfoClass,
+ DataJobInputOutputClass,
+)
+
+
+@dataclass
+class ProcedureDependency:
+ db: str
+ schema: str
+ name: str
+ type: str
+ env: str
+ server: str
+ source: str = "mssql"
+
+
+@dataclass
+class ProcedureLineageStream:
+ dependencies: List[ProcedureDependency]
+
+ @property
+ def as_property(self) -> Dict[str, str]:
+ return {
+ f"{dep.db}.{dep.schema}.{dep.name}": dep.type for dep in self.dependencies
+ }
+
+
+@dataclass
+class MSSQLJob:
+ db: str
+ platform_instance: str
+ name: str
+ env: str
+ source: str = "mssql"
+ type: str = "JOB"
+
+ @property
+ def formatted_name(self) -> str:
+ return f"{self.formatted_platform_instance}.{self.name.replace(',', '-')}"
+
+ @property
+ def full_type(self) -> str:
+ return f"({self.source},{self.formatted_name},{self.env})"
+
+ @property
+ def orchestrator(self) -> str:
+ return self.source
+
+ @property
+ def formatted_platform_instance(self) -> str:
+ return self.platform_instance.replace(".", "/")
+
+ @property
+ def cluster(self) -> str:
+ return f"{self.env}"
+
+
+@dataclass
+class MSSQLProceduresContainer:
+ db: str
+ platform_instance: str
+ name: str
+ env: str
+ source: str = "mssql"
+ type: str = "JOB"
+
+ @property
+ def formatted_name(self) -> str:
+ return f"{self.formatted_platform_instance}.{self.name.replace(',', '-')}"
+
+ @property
+ def orchestrator(self) -> str:
+ return self.source
+
+ @property
+ def formatted_platform_instance(self) -> str:
+ return self.platform_instance.replace(".", "/")
+
+ @property
+ def cluster(self) -> str:
+ return f"{self.env}"
+
+ @property
+ def full_type(self) -> str:
+ return f"({self.source},{self.name},{self.env})"
+
+
+@dataclass
+class ProcedureParameter:
+ name: str
+ type: str
+
+ @property
+ def properties(self) -> Dict[str, str]:
+ return {"type": self.type}
+
+
+@dataclass
+class StoredProcedure:
+ db: str
+ schema: str
+ name: str
+ flow: Union[MSSQLJob, MSSQLProceduresContainer]
+ type: str = "STORED_PROCEDURE"
+ source: str = "mssql"
+
+ @property
+ def full_type(self) -> str:
+ return self.source.upper() + "_" + self.type
+
+ @property
+ def formatted_name(self) -> str:
+ return self.name.replace(",", "-")
+
+ @property
+ def full_name(self) -> str:
+ return f"{self.db}.{self.schema}.{self.formatted_name}"
+
+ @property
+ def escape_full_name(self) -> str:
+ return f"[{self.db}].[{self.schema}].[{self.formatted_name}]"
+
+
+@dataclass
+class JobStep:
+ job_name: str
+ step_name: str
+ flow: MSSQLJob
+ type: str = "JOB_STEP"
+ source: str = "mssql"
+
+ @property
+ def formatted_step(self) -> str:
+ return self.step_name.replace(",", "-").replace(" ", "_").lower()
+
+ @property
+ def formatted_name(self) -> str:
+ return self.job_name.replace(",", "-")
+
+ @property
+ def full_type(self) -> str:
+ return self.source.upper() + "_" + self.type
+
+ @property
+ def full_name(self) -> str:
+ return f"{self.formatted_name}.{self.formatted_name}"
+
+
+@dataclass
+class MSSQLDataJob:
+ entity: Union[StoredProcedure, JobStep]
+ type: str = "dataJob"
+ source: str = "mssql"
+ external_url: str = ""
+ description: Optional[str] = None
+ status: Optional[str] = None
+ incoming: List[str] = field(default_factory=list)
+ outgoing: List[str] = field(default_factory=list)
+ input_jobs: List[str] = field(default_factory=list)
+ job_properties: Dict[str, str] = field(default_factory=dict)
+
+ @property
+ def urn(self) -> str:
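+ # Compose the DataJob URN from the parent flow (job or procedure container) and this entity's name.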
+ return make_data_job_urn(
+ orchestrator=self.entity.flow.orchestrator,
+ flow_id=self.entity.flow.formatted_name,
+ job_id=self.entity.formatted_name,
+ cluster=self.entity.flow.cluster,
+ )
+
+ def add_property(
+ self,
+ name: str,
+ value: str,
+ ) -> None:
+ self.job_properties[name] = value
+
+ @property
+ def valued_properties(self) -> Dict[str, str]:
+ if self.job_properties:
+ return {k: v for k, v in self.job_properties.items() if v is not None}
+ return self.job_properties
+
+ @property
+ def as_datajob_input_output_aspect(self) -> DataJobInputOutputClass:
+ return DataJobInputOutputClass(
+ inputDatasets=sorted(self.incoming),
+ outputDatasets=sorted(self.outgoing),
+ inputDatajobs=sorted(self.input_jobs),
+ )
+
+ @property
+ def as_datajob_info_aspect(self) -> DataJobInfoClass:
+ return DataJobInfoClass(
+ name=self.entity.full_name,
+ type=self.entity.full_type,
+ description=self.description,
+ customProperties=self.valued_properties,
+ externalUrl=self.external_url,
+ status=self.status,
+ )
+
+
+@dataclass
+class MSSQLDataFlow:
+ entity: Union[MSSQLJob, MSSQLProceduresContainer]
+ type: str = "dataFlow"
+ source: str = "mssql"
+ external_url: str = ""
+ flow_properties: Dict[str, str] = field(default_factory=dict)
+
+ def add_property(
+ self,
+ name: str,
+ value: str,
+ ) -> None:
+ self.flow_properties[name] = value
+
+ @property
+ def urn(self) -> str:
+ return make_data_flow_urn(
+ orchestrator=self.entity.orchestrator,
+ flow_id=self.entity.formatted_name,
+ cluster=self.entity.cluster,
+ )
+
+ @property
+ def as_dataflow_info_aspect(self) -> DataFlowInfoClass:
+ return DataFlowInfoClass(
+ name=self.entity.formatted_name,
+ customProperties=self.flow_properties,
+ externalUrl=self.external_url,
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py
new file mode 100644
index 00000000000000..3c7701d93edebc
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py
@@ -0,0 +1,665 @@
+import logging
+import re
+import urllib.parse
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import pydantic
+import sqlalchemy.dialects.mssql
+
+# This import verifies that the dependencies are available.
+import sqlalchemy_pytds # noqa: F401
+from pydantic.fields import Field
+from sqlalchemy import create_engine, inspect
+from sqlalchemy.engine.base import Connection
+from sqlalchemy.engine.reflection import Inspector
+from sqlalchemy.exc import ProgrammingError, ResourceClosedError
+
+from datahub.configuration.common import AllowDenyPattern
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+ SourceCapability,
+ SupportStatus,
+ capability,
+ config_class,
+ platform_name,
+ support_status,
+)
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.sql.mssql.job_models import (
+ JobStep,
+ MSSQLDataFlow,
+ MSSQLDataJob,
+ MSSQLJob,
+ MSSQLProceduresContainer,
+ ProcedureDependency,
+ ProcedureLineageStream,
+ ProcedureParameter,
+ StoredProcedure,
+)
+from datahub.ingestion.source.sql.sql_common import (
+ SQLAlchemySource,
+ SqlWorkUnit,
+ register_custom_type,
+)
+from datahub.ingestion.source.sql.sql_config import (
+ BasicSQLAlchemyConfig,
+ make_sqlalchemy_uri,
+)
+from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass)
+
+
+class SQLServerConfig(BasicSQLAlchemyConfig):
+ # defaults
+ host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
+ scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True)
+ include_stored_procedures: bool = Field(
+ default=True,
+ description="Include ingest of stored procedures. Requires access to the 'sys' schema.",
+ )
+ include_stored_procedures_code: bool = Field(
+ default=True, description="Include information about object code."
+ )
+ include_jobs: bool = Field(
+ default=True,
+ description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.",
+ )
+ include_descriptions: bool = Field(
+ default=True, description="Include table descriptions information."
+ )
+ use_odbc: bool = Field(
+ default=False,
+ description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.",
+ )
+ uri_args: Dict[str, str] = Field(
+ default={},
+ description="Arguments to URL-encode when connecting. See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.",
+ )
+ database_pattern: AllowDenyPattern = Field(
+ default=AllowDenyPattern.allow_all(),
+ description="Regex patterns for databases to filter in ingestion.",
+ )
+ database: Optional[str] = Field(
+ default=None,
+ description="database (catalog). If set to Null, all databases will be considered for ingestion.",
+ )
+ convert_urns_to_lowercase: bool = Field(
+ default=False,
+ description="Enable to convert the SQL Server assets urns to lowercase",
+ )
+
+ @pydantic.validator("uri_args")
+ def passwords_match(cls, v, values, **kwargs):
+ if values["use_odbc"] and "driver" not in v:
+ raise ValueError("uri_args must contain a 'driver' option")
+ elif not values["use_odbc"] and v:
+ raise ValueError("uri_args is not supported when ODBC is disabled")
+ return v
+
+ def get_sql_alchemy_url(
+ self,
+ uri_opts: Optional[Dict[str, Any]] = None,
+ current_db: Optional[str] = None,
+ ) -> str:
+ if self.use_odbc:
+ # Ensure that the import is available.
+ import pyodbc # noqa: F401
+
+ self.scheme = "mssql+pyodbc"
+
+ uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri(
+ self.scheme, # type: ignore
+ self.username,
+ self.password.get_secret_value() if self.password else None,
+ self.host_port, # type: ignore
+ current_db if current_db else self.database,
+ uri_opts=uri_opts,
+ )
+ if self.use_odbc:
+ uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}"
+ return uri
+
+ @property
+ def host(self):
+ return self.platform_instance or self.host_port.split(":")[0]
+
+ @property
+ def db(self):
+ return self.database_alias or self.database
+
+
+@platform_name("Microsoft SQL Server", id="mssql")
+@config_class(SQLServerConfig)
+@support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
+@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+@capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
+@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+class SQLServerSource(SQLAlchemySource):
+ """
+ This plugin extracts the following:
+ - Metadata for databases, schemas, views and tables
+ - Column types associated with each table/view
+ - Table, row, and column statistics via optional SQL profiling
+ We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections.
+ """
+
+ def __init__(self, config: SQLServerConfig, ctx: PipelineContext):
+ super().__init__(config, ctx, "mssql")
+ # Cache the table and column descriptions
+ self.config: SQLServerConfig = config
+ self.current_database = None
+ self.table_descriptions: Dict[str, str] = {}
+ self.column_descriptions: Dict[str, str] = {}
+ if self.config.include_descriptions:
+ for inspector in self.get_inspectors():
+ db_name: str = self.get_db_name(inspector)
+ with inspector.engine.connect() as conn:
+ if self.config.use_odbc:
+ self._add_output_converters(conn)
+ self._populate_table_descriptions(conn, db_name)
+ self._populate_column_descriptions(conn, db_name)
+
+ @staticmethod
+ def _add_output_converters(conn: Connection) -> None:
+ def handle_sql_variant_as_string(value):
+ try:
+ return value.decode("utf-16le")
+ except UnicodeDecodeError:
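+ # Fall back to a single-byte codepage for sql_variant values that are not valid UTF-16.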
+ return value.decode("Windows-1251")
+
+ # see https://stackoverflow.com/questions/45677374/pandas-pyodbc-odbc-sql-type-150-is-not-yet-supported
+ # and https://stackoverflow.com/questions/11671170/adding-output-converter-to-pyodbc-connection-in-sqlalchemy
+ try:
+ conn.connection.add_output_converter(-150, handle_sql_variant_as_string)
+ except AttributeError as e:
+ logger.debug(
+ f"Failed to mount output converter for MSSQL data type -150 due to {e}"
+ )
+
+ def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None:
+ # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name
+ # also see https://www.mssqltips.com/sqlservertip/5384/working-with-sql-server-extended-properties/
+ table_metadata = conn.execute(
+ """
+ SELECT
+ SCHEMA_NAME(T.SCHEMA_ID) AS schema_name,
+ T.NAME AS table_name,
+ EP.VALUE AS table_description
+ FROM sys.tables AS T
+ INNER JOIN sys.extended_properties AS EP
+ ON EP.MAJOR_ID = T.[OBJECT_ID]
+ AND EP.MINOR_ID = 0
+ AND EP.NAME = 'MS_Description'
+ AND EP.CLASS = 1
+ """
+ )
+ for row in table_metadata:
+ self.table_descriptions[
+ f"{db_name}.{row['schema_name']}.{row['table_name']}"
+ ] = row["table_description"]
+
+ def _populate_column_descriptions(self, conn: Connection, db_name: str) -> None:
+ column_metadata = conn.execute(
+ """
+ SELECT
+ SCHEMA_NAME(T.SCHEMA_ID) AS schema_name,
+ T.NAME AS table_name,
+ C.NAME AS column_name ,
+ EP.VALUE AS column_description
+ FROM sys.tables AS T
+ INNER JOIN sys.all_columns AS C
+ ON C.OBJECT_ID = T.[OBJECT_ID]
+ INNER JOIN sys.extended_properties AS EP
+ ON EP.MAJOR_ID = T.[OBJECT_ID]
+ AND EP.MINOR_ID = C.COLUMN_ID
+ AND EP.NAME = 'MS_Description'
+ AND EP.CLASS = 1
+ """
+ )
+ for row in column_metadata:
+ self.column_descriptions[
+ f"{db_name}.{row['schema_name']}.{row['table_name']}.{row['column_name']}"
+ ] = row["column_description"]
+
+ @classmethod
+ def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource":
+ config = SQLServerConfig.parse_obj(config_dict)
+ return cls(config, ctx)
+
+ # override to get table descriptions
+ def get_table_properties(
+ self, inspector: Inspector, schema: str, table: str
+ ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]:
+ description, properties, location_urn = super().get_table_properties(
+ inspector, schema, table
+ )
+ # Update description if available.
+ db_name: str = self.get_db_name(inspector)
+ description = self.table_descriptions.get(
+ f"{db_name}.{schema}.{table}", description
+ )
+ return description, properties, location_urn
+
+ # override to get column descriptions
+ def _get_columns(
+ self, dataset_name: str, inspector: Inspector, schema: str, table: str
+ ) -> List[Dict]:
+ columns: List[Dict] = super()._get_columns(
+ dataset_name, inspector, schema, table
+ )
+ # Update column description if available.
+ db_name: str = self.get_db_name(inspector)
+ for column in columns:
+ description: Optional[str] = self.column_descriptions.get(
+ f"{db_name}.{schema}.{table}.{column['name']}",
+ )
+ if description:
+ column["comment"] = description
+ return columns
+
+ def get_database_level_workunits(
+ self,
+ inspector: Inspector,
+ database: str,
+ ) -> Iterable[MetadataWorkUnit]:
+ yield from super().get_database_level_workunits(
+ inspector=inspector,
+ database=database,
+ )
+ if self.config.include_jobs:
+ try:
+ yield from self.loop_jobs(inspector, self.config)
+ except Exception as e:
+ self.report.report_failure(
+ "jobs",
+ f"Failed to list jobs due to error {e}",
+ )
+
+ def get_schema_level_workunits(
+ self,
+ inspector: Inspector,
+ schema: str,
+ database: str,
+ ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
+ yield from super().get_schema_level_workunits(
+ inspector=inspector,
+ schema=schema,
+ database=database,
+ )
+ if self.config.include_stored_procedures:
+ try:
+ yield from self.loop_stored_procedures(inspector, schema, self.config)
+ except Exception as e:
+ self.report.report_failure(
+ "jobs",
+ f"Failed to list jobs due to error {e}",
+ )
+
+ def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]:
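+ # SQL Server Agent job and step metadata lives in the msdb system database.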
+ jobs_data = conn.execute(
+ f"""
+ SELECT
+ job.job_id,
+ job.name,
+ job.description,
+ job.date_created,
+ job.date_modified,
+ steps.step_id,
+ steps.step_name,
+ steps.subsystem,
+ steps.command,
+ steps.database_name
+ FROM
+ msdb.dbo.sysjobs job
+ INNER JOIN
+ msdb.dbo.sysjobsteps steps
+ ON
+ job.job_id = steps.job_id
+ where database_name = '{db_name}'
+ """
+ )
+ jobs: Dict[str, Dict[str, Any]] = {}
+ for row in jobs_data:
+ step_data = dict(
+ job_id=row["job_id"],
+ job_name=row["name"],
+ description=row["description"],
+ date_created=row["date_created"],
+ date_modified=row["date_modified"],
+ step_id=row["step_id"],
+ step_name=row["step_name"],
+ subsystem=row["subsystem"],
+ command=row["command"],
+ )
+ if row["name"] in jobs:
+ jobs[row["name"]][row["step_id"]] = step_data
+ else:
+ jobs[row["name"]] = {row["step_id"]: step_data}
+ return jobs
+
+ def loop_jobs(
+ self,
+ inspector: Inspector,
+ sql_config: SQLServerConfig,
+ ) -> Iterable[MetadataWorkUnit]:
+ """
+ Loop MS SQL jobs as dataFlow-s.
+ :return:
+ """
+ db_name = self.get_db_name(inspector)
+ with inspector.engine.connect() as conn:
+ jobs = self._get_jobs(conn, db_name)
+ for job_name, job_steps in jobs.items():
+ job = MSSQLJob(
+ name=job_name,
+ env=sql_config.env,
+ db=db_name,
+ platform_instance=sql_config.host,
+ )
+ data_flow = MSSQLDataFlow(entity=job)
+ yield from self.construct_flow_workunits(data_flow=data_flow)
+ yield from self.loop_job_steps(job, job_steps)
+
+ def loop_job_steps(
+ self, job: MSSQLJob, job_steps: Dict[str, Any]
+ ) -> Iterable[MetadataWorkUnit]:
+ for step_id, step_data in job_steps.items():
+ step = JobStep(
+ job_name=job.formatted_name,
+ step_name=step_data["step_name"],
+ flow=job,
+ )
+ data_job = MSSQLDataJob(entity=step)
+ for data_name, data_value in step_data.items():
+ data_job.add_property(name=data_name, value=str(data_value))
+ yield from self.construct_job_workunits(data_job)
+
+ def loop_stored_procedures( # noqa: C901
+ self,
+ inspector: Inspector,
+ schema: str,
+ sql_config: SQLServerConfig,
+ ) -> Iterable[MetadataWorkUnit]:
+ """
+ Loop schema data for get stored procedures as dataJob-s.
+ """
+ db_name = self.get_db_name(inspector)
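+ # All stored procedures in a schema are grouped under a single synthetic flow (container).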
+ procedure_flow_name = f"{db_name}.{schema}.stored_procedures"
+ mssql_default_job = MSSQLProceduresContainer(
+ name=procedure_flow_name,
+ env=sql_config.env,
+ db=db_name,
+ platform_instance=sql_config.host,
+ )
+ data_flow = MSSQLDataFlow(entity=mssql_default_job)
+ with inspector.engine.connect() as conn:
+ procedures_data_list = self._get_stored_procedures(conn, db_name, schema)
+ procedures = [
+ StoredProcedure(flow=mssql_default_job, **procedure_data)
+ for procedure_data in procedures_data_list
+ ]
+ if procedures:
+ yield from self.construct_flow_workunits(data_flow=data_flow)
+ for procedure in procedures:
+ upstream = self._get_procedure_upstream(conn, procedure)
+ downstream = self._get_procedure_downstream(conn, procedure)
+ data_job = MSSQLDataJob(
+ entity=procedure,
+ )
+ # TODO: upstream and downstream here are dependency lists rather than true
+ # DataJobInputOutput lineage; they should eventually be reorganized into
+ # proper lineage edges.
+ data_job.add_property("procedure_depends_on", str(upstream.as_property))
+ data_job.add_property(
+ "depending_on_procedure", str(downstream.as_property)
+ )
+ procedure_definition, procedure_code = self._get_procedure_code(
+ conn, procedure
+ )
+ if procedure_definition:
+ data_job.add_property("definition", procedure_definition)
+ if sql_config.include_stored_procedures_code and procedure_code:
+ data_job.add_property("code", procedure_code)
+ procedure_inputs = self._get_procedure_inputs(conn, procedure)
+ properties = self._get_procedure_properties(conn, procedure)
+ data_job.add_property(
+ "input parameters", str([param.name for param in procedure_inputs])
+ )
+ for param in procedure_inputs:
+ data_job.add_property(
+ f"parameter {param.name}", str(param.properties)
+ )
+ for property_name, property_value in properties.items():
+ data_job.add_property(property_name, str(property_value))
+ yield from self.construct_job_workunits(data_job)
+
+ @staticmethod
+ def _get_procedure_downstream(
+ conn: Connection, procedure: StoredProcedure
+ ) -> ProcedureLineageStream:
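+ # Downstream = tables/views that reference this procedure, per sys.sql_expression_dependencies.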
+ downstream_data = conn.execute(
+ f"""
+ SELECT DISTINCT OBJECT_SCHEMA_NAME ( referencing_id ) AS [schema],
+ OBJECT_NAME(referencing_id) AS [name],
+ o.type_desc AS [type]
+ FROM sys.sql_expression_dependencies AS sed
+ INNER JOIN sys.objects AS o ON sed.referencing_id = o.object_id
+ left join sys.objects o1 on sed.referenced_id = o1.object_id
+ WHERE referenced_id = OBJECT_ID(N'{procedure.escape_full_name}')
+ AND o.type_desc in ('TABLE_TYPE', 'VIEW', 'USER_TABLE')
+ """
+ )
+ downstream_dependencies = []
+ for row in downstream_data:
+ downstream_dependencies.append(
+ ProcedureDependency(
+ db=procedure.db,
+ schema=row["schema"],
+ name=row["name"],
+ type=row["type"],
+ env=procedure.flow.env,
+ server=procedure.flow.platform_instance,
+ )
+ )
+ return ProcedureLineageStream(dependencies=downstream_dependencies)
+
+ @staticmethod
+ def _get_procedure_upstream(
+ conn: Connection, procedure: StoredProcedure
+ ) -> ProcedureLineageStream:
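+ # Upstream = tables, views, and procedures that this procedure references.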
+ upstream_data = conn.execute(
+ f"""
+ SELECT DISTINCT
+ coalesce(lower(referenced_database_name), db_name()) AS db,
+ referenced_schema_name AS [schema],
+ referenced_entity_name AS [name],
+ o1.type_desc AS [type]
+ FROM sys.sql_expression_dependencies AS sed
+ INNER JOIN sys.objects AS o ON sed.referencing_id = o.object_id
+ left join sys.objects o1 on sed.referenced_id = o1.object_id
+ WHERE referencing_id = OBJECT_ID(N'{procedure.escape_full_name}')
+ AND referenced_schema_name is not null
+ AND o1.type_desc in ('TABLE_TYPE', 'VIEW', 'SQL_STORED_PROCEDURE', 'USER_TABLE')
+ """
+ )
+ upstream_dependencies = []
+ for row in upstream_data:
+ upstream_dependencies.append(
+ ProcedureDependency(
+ db=row["db"],
+ schema=row["schema"],
+ name=row["name"],
+ type=row["type"],
+ env=procedure.flow.env,
+ server=procedure.flow.platform_instance,
+ )
+ )
+ return ProcedureLineageStream(dependencies=upstream_dependencies)
+
+ @staticmethod
+ def _get_procedure_inputs(
+ conn: Connection, procedure: StoredProcedure
+ ) -> List[ProcedureParameter]:
+ inputs_data = conn.execute(
+ f"""
+ SELECT
+ name,
+ type_name(user_type_id) AS 'type'
+ FROM sys.parameters
+ WHERE object_id = object_id('{procedure.escape_full_name}')
+ """
+ )
+ inputs_list = []
+ for row in inputs_data:
+ inputs_list.append(ProcedureParameter(name=row["name"], type=row["type"]))
+ return inputs_list
+
+ @staticmethod
+ def _get_procedure_code(
+ conn: Connection, procedure: StoredProcedure
+ ) -> Tuple[Optional[str], Optional[str]]:
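+ # sp_helptext returns the procedure source line by line; text before the CREATE PROCEDURE line is treated as the definition header, the rest as code.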
+ query = f"EXEC [{procedure.db}].dbo.sp_helptext '{procedure.full_name}'"
+ try:
+ code_data = conn.execute(query)
+ except ProgrammingError:
+ logger.warning(
+ "Denied permission for read text from procedure '%s'",
+ procedure.full_name,
+ )
+ return None, None
+ code_list = []
+ code_slice_index = 0
+ code_slice_text = "create procedure"
+ try:
+ for index, row in enumerate(code_data):
+ code_list.append(row["Text"])
+ if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip():
+ code_slice_index = index
+ definition = "\n".join(code_list[:code_slice_index])
+ code = "\n".join(code_list[code_slice_index:])
+ except ResourceClosedError:
+ logger.warning(
+ "Connection was closed from procedure '%s'",
+ procedure.full_name,
+ )
+ return None, None
+ return definition, code
+
+ @staticmethod
+ def _get_procedure_properties(
+ conn: Connection, procedure: StoredProcedure
+ ) -> Dict[str, Any]:
+ properties_data = conn.execute(
+ f"""
+ SELECT
+ create_date as date_created,
+ modify_date as date_modified
+ FROM sys.procedures
+ WHERE object_id = object_id('{procedure.full_name}')
+ """
+ )
+ properties = {}
+ for row in properties_data:
+ properties = dict(
+ date_created=row["date_created"], date_modified=row["date_modified"]
+ )
+ return properties
+
+ @staticmethod
+ def _get_stored_procedures(
+ conn: Connection, db_name: str, schema: str
+ ) -> List[Dict[str, str]]:
+ stored_procedures_data = conn.execute(
+ f"""
+ SELECT
+ pr.name as procedure_name,
+ s.name as schema_name
+ FROM
+ [{db_name}].[sys].[procedures] pr
+ INNER JOIN
+ [{db_name}].[sys].[schemas] s ON pr.schema_id = s.schema_id
+ where s.name = '{schema}'
+ """
+ )
+ procedures_list = []
+ for row in stored_procedures_data:
+ procedures_list.append(
+ dict(db=db_name, schema=row["schema_name"], name=row["procedure_name"])
+ )
+ return procedures_list
+
+ def construct_job_workunits(
+ self,
+ data_job: MSSQLDataJob,
+ ) -> Iterable[MetadataWorkUnit]:
+ yield MetadataChangeProposalWrapper(
+ entityUrn=data_job.urn,
+ aspect=data_job.as_datajob_info_aspect,
+ ).as_workunit()
+
+ yield MetadataChangeProposalWrapper(
+ entityUrn=data_job.urn,
+ aspect=data_job.as_datajob_input_output_aspect,
+ ).as_workunit()
+ # TODO: Add SubType when it appears
+
+ def construct_flow_workunits(
+ self,
+ data_flow: MSSQLDataFlow,
+ ) -> Iterable[MetadataWorkUnit]:
+ yield MetadataChangeProposalWrapper(
+ entityUrn=data_flow.urn,
+ aspect=data_flow.as_dataflow_info_aspect,
+ ).as_workunit()
+ # TODO: Add SubType when it appears
+
+ def get_inspectors(self) -> Iterable[Inspector]:
+ # This method can be overridden in the case that you want to dynamically
+ # run on multiple databases.
+ url = self.config.get_sql_alchemy_url()
+ logger.debug(f"sql_alchemy_url={url}")
+ engine = create_engine(url, **self.config.options)
+ with engine.connect() as conn:
+ if self.config.database and self.config.database != "":
+ inspector = inspect(conn)
+ yield inspector
+ else:
+ databases = conn.execute(
+ "SELECT name FROM master.sys.databases WHERE name NOT IN \
+ ('master', 'model', 'msdb', 'tempdb', 'Resource', \
+ 'distribution' , 'reportserver', 'reportservertempdb'); "
+ )
+ for db in databases:
+ if self.config.database_pattern.allowed(db["name"]):
+ url = self.config.get_sql_alchemy_url(current_db=db["name"])
+ with create_engine(
+ url, **self.config.options
+ ).connect() as conn:
+ inspector = inspect(conn)
+ self.current_database = db["name"]
+ yield inspector
+
+ def get_identifier(
+ self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
+ ) -> str:
+ regular = f"{schema}.{entity}"
+ qualified_table_name = regular
+ if self.config.database:
+ if self.config.database_alias:
+ qualified_table_name = f"{self.config.database_alias}.{regular}"
+ else:
+ qualified_table_name = f"{self.config.database}.{regular}"
+ if self.current_database:
+ qualified_table_name = f"{self.current_database}.{regular}"
+ return (
+ qualified_table_name.lower()
+ if self.config.convert_urns_to_lowercase
+ else qualified_table_name
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
index 280f4f47adcdf3..b5458a42192fc8 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -478,6 +478,27 @@ def add_table_to_schema_container(
parent_container_key=schema_container_key,
)
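+ # Hooks that subclasses (e.g. the SQL Server source) override to emit additional database- or schema-scoped work units.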
+ def get_database_level_workunits(
+ self,
+ inspector: Inspector,
+ database: str,
+ ) -> Iterable[MetadataWorkUnit]:
+ yield from self.gen_database_containers(database=database)
+
+ def get_schema_level_workunits(
+ self,
+ inspector: Inspector,
+ schema: str,
+ database: str,
+ ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
+ yield from self.gen_schema_containers(schema=schema, database=database)
+
+ if self.config.include_tables:
+ yield from self.loop_tables(inspector, schema, self.config)
+
+ if self.config.include_views:
+ yield from self.loop_views(inspector, schema, self.config)
+
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
return [
*super().get_workunit_processors(),
@@ -516,27 +537,20 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit
)
db_name = self.get_db_name(inspector)
- yield from self.gen_database_containers(
+ yield from self.get_database_level_workunits(
+ inspector=inspector,
database=db_name,
)
for schema in self.get_allowed_schemas(inspector, db_name):
self.add_information_for_schema(inspector, schema)
- yield from self.gen_schema_containers(
- database=db_name,
+ yield from self.get_schema_level_workunits(
+ inspector=inspector,
schema=schema,
- extra_properties=self.get_schema_properties(
- inspector=inspector, schema=schema, database=db_name
- ),
+ database=db_name,
)
- if sql_config.include_tables:
- yield from self.loop_tables(inspector, schema, sql_config)
-
- if sql_config.include_views:
- yield from self.loop_views(inspector, schema, sql_config)
-
if profiler:
profile_requests += list(
self.loop_profiler_requests(inspector, schema, sql_config)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py
new file mode 100644
index 00000000000000..2fcc93292c2efe
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py
@@ -0,0 +1,223 @@
+import json
+import logging
+import os
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from functools import partial
+from typing import Iterable, List, Optional, Set
+
+from pydantic import Field
+
+from datahub.configuration.source_common import (
+ EnvConfigMixin,
+ PlatformInstanceConfigMixin,
+)
+from datahub.emitter.mce_builder import (
+ make_dataset_urn_with_platform_instance,
+ make_user_urn,
+)
+from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+ SupportStatus,
+ config_class,
+ platform_name,
+ support_status,
+)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
+from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage
+
+logger = logging.getLogger(__name__)
+
+
+class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+ query_file: str = Field(description="Path to file to ingest")
+
+ platform: str = Field(
+ description="The platform for which to generate data, e.g. snowflake"
+ )
+
+ usage: BaseUsageConfig = Field(
+ description="The usage config to use when generating usage statistics",
+ default=BaseUsageConfig(),
+ )
+
+ use_schema_resolver: bool = Field(
+ description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. Turn off only for testing.",
+ default=True,
+ hidden_from_docs=True,
+ )
+ default_db: Optional[str] = Field(
+ description="The default database to use for unqualified table names",
+ default=None,
+ )
+ default_schema: Optional[str] = Field(
+ description="The default schema to use for unqualified table names",
+ default=None,
+ )
+
+
+class SqlQueriesSourceReport(SourceReport):
+ num_queries_parsed: int = 0
+ num_table_parse_failures: int = 0
+ num_column_parse_failures: int = 0
+
+ def compute_stats(self) -> None:
+ super().compute_stats()
+ self.table_failure_rate = (
+ f"{self.num_table_parse_failures / self.num_queries_parsed:.4f}"
+ if self.num_queries_parsed
+ else "0"
+ )
+ self.column_failure_rate = (
+ f"{self.num_column_parse_failures / self.num_queries_parsed:.4f}"
+ if self.num_queries_parsed
+ else "0"
+ )
+
+
+@platform_name("SQL Queries")
+@config_class(SqlQueriesSourceConfig)
+@support_status(SupportStatus.TESTING)
+class SqlQueriesSource(Source):
+ # TODO: Documentation
+ urns: Optional[Set[str]]
+ schema_resolver: SchemaResolver
+ builder: SqlParsingBuilder
+
+ def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig):
+ if not ctx.graph:
+ raise ValueError(
+ "SqlQueriesSource needs a datahub_api from which to pull schema metadata"
+ )
+
+ self.graph: DataHubGraph = ctx.graph
+ self.ctx = ctx
+ self.config = config
+ self.report = SqlQueriesSourceReport()
+
+ self.builder = SqlParsingBuilder(usage_config=self.config.usage)
+
+ if self.config.use_schema_resolver:
+ schema_resolver, urns = self.graph.initialize_schema_resolver_from_datahub(
+ platform=self.config.platform,
+ platform_instance=self.config.platform_instance,
+ env=self.config.env,
+ )
+ self.schema_resolver = schema_resolver
+ self.urns = urns
+ else:
+ self.schema_resolver = self.graph._make_schema_resolver(
+ platform=self.config.platform,
+ platform_instance=self.config.platform_instance,
+ env=self.config.env,
+ )
+ self.urns = None
+
+ @classmethod
+ def create(cls, config_dict: dict, ctx: PipelineContext) -> "SqlQueriesSource":
+ config = SqlQueriesSourceConfig.parse_obj(config_dict)
+ return cls(ctx, config)
+
+ def get_report(self) -> SqlQueriesSourceReport:
+ return self.report
+
+ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+ return [partial(auto_workunit_reporter, self.get_report())]
+
+ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+ logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")
+ with open(self.config.query_file) as f:
+ for line in f:
+ try:
+ query_dict = json.loads(line, strict=False)
+ entry = QueryEntry.create(query_dict, config=self.config)
+ yield from self._process_query(entry)
+ except Exception as e:
+ logger.warning("Error processing query", exc_info=True)
+ self.report.report_warning("process-query", str(e))
+
+ logger.info("Generating workunits")
+ yield from self.builder.gen_workunits()
+
+ def _process_query(self, entry: "QueryEntry") -> Iterable[MetadataWorkUnit]:
+ self.report.num_queries_parsed += 1
+ if self.report.num_queries_parsed % 1000 == 0:
+ logger.info(f"Parsed {self.report.num_queries_parsed} queries")
+
+ result = sqlglot_lineage(
+ sql=entry.query,
+ schema_resolver=self.schema_resolver,
+ default_db=self.config.default_db,
+ default_schema=self.config.default_schema,
+ )
+ if result.debug_info.table_error:
+ logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")
+ self.report.num_table_parse_failures += 1
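+ # Fall back to the lineage explicitly declared in the entry when SQL parsing fails.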
+ for downstream_urn in set(entry.downstream_tables):
+ self.builder.add_lineage(
+ downstream_urn=downstream_urn,
+ upstream_urns=entry.upstream_tables,
+ timestamp=entry.timestamp,
+ user=entry.user,
+ )
+ return
+ elif result.debug_info.column_error:
+ logger.debug(
+ f"Error parsing column lineage, {result.debug_info.column_error}"
+ )
+ self.report.num_column_parse_failures += 1
+
+ yield from self.builder.process_sql_parsing_result(
+ result,
+ query=entry.query,
+ query_timestamp=entry.timestamp,
+ user=entry.user,
+ custom_operation_type=entry.operation_type,
+ include_urns=self.urns,
+ )
+
+
+@dataclass
+class QueryEntry:
+ query: str
+ timestamp: Optional[datetime]
+ user: Optional[str]
+ operation_type: Optional[str]
+ downstream_tables: List[str]
+ upstream_tables: List[str]
+
+ @classmethod
+ def create(
+ cls, entry_dict: dict, *, config: SqlQueriesSourceConfig
+ ) -> "QueryEntry":
+ return cls(
+ query=entry_dict["query"],
+ timestamp=datetime.fromtimestamp(entry_dict["timestamp"], tz=timezone.utc)
+ if "timestamp" in entry_dict
+ else None,
+ user=make_user_urn(entry_dict["user"]) if "user" in entry_dict else None,
+ operation_type=entry_dict.get("operation_type"),
+ downstream_tables=[
+ make_dataset_urn_with_platform_instance(
+ name=table,
+ platform=config.platform,
+ platform_instance=config.platform_instance,
+ env=config.env,
+ )
+ for table in entry_dict.get("downstream_tables", [])
+ ],
+ upstream_tables=[
+ make_dataset_urn_with_platform_instance(
+ name=table,
+ platform=config.platform,
+ platform_instance=config.platform_instance,
+ env=config.env,
+ )
+ for table in entry_dict.get("upstream_tables", [])
+ ],
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py
index 6752bdf5198308..ec0af37089b1da 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py
@@ -31,6 +31,7 @@
from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError
import datahub.emitter.mce_builder as builder
+import datahub.utilities.sqlglot_lineage as sqlglot_l
from datahub.configuration.common import (
AllowDenyPattern,
ConfigModel,
@@ -136,12 +137,7 @@
ViewPropertiesClass,
)
from datahub.utilities import config_clean
-from datahub.utilities.sqlglot_lineage import (
- ColumnLineageInfo,
- SchemaResolver,
- SqlParsingResult,
- sqlglot_lineage,
-)
+from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
logger: logging.Logger = logging.getLogger(__name__)
@@ -1585,42 +1581,14 @@ def parse_custom_sql(
f"Overridden info upstream_db={upstream_db}, platform_instance={platform_instance}, platform={platform}"
)
- parsed_result: Optional["SqlParsingResult"] = None
- try:
- schema_resolver = (
- self.ctx.graph._make_schema_resolver(
- platform=platform,
- platform_instance=platform_instance,
- env=env,
- )
- if self.ctx.graph is not None
- else SchemaResolver(
- platform=platform,
- platform_instance=platform_instance,
- env=env,
- graph=None,
- )
- )
-
- if schema_resolver.graph is None:
- logger.warning(
- "Column Level Lineage extraction would not work as DataHub graph client is None."
- )
-
- parsed_result = sqlglot_lineage(
- query,
- schema_resolver=schema_resolver,
- default_db=upstream_db,
- )
- except Exception as e:
- self.report.report_warning(
- key="csql-lineage",
- reason=f"Unable to retrieve lineage from query. "
- f"Query: {query} "
- f"Reason: {str(e)} ",
- )
-
- return parsed_result
+ return sqlglot_l.create_lineage_sql_parsed_result(
+ query=query,
+ database=upstream_db,
+ platform=platform,
+ platform_instance=platform_instance,
+ env=env,
+ graph=self.ctx.graph,
+ )
def _create_lineage_from_unsupported_csql(
self, csql_urn: str, csql: dict
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py
index d5da93c7be35e6..49f56b46fb0121 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py
@@ -176,10 +176,8 @@ def _parse_query_via_lineage_runner(self, query: str) -> Optional[StringTableInf
for table in runner.target_tables
],
)
- except Exception:
- logger.info(
- f"Could not parse query via lineage runner, {query}", exc_info=True
- )
+ except Exception as e:
+ logger.info(f"Could not parse query via lineage runner, {query}: {e!r}")
return None
@staticmethod
@@ -202,8 +200,8 @@ def _parse_query_via_spark_sql_plan(self, query: str) -> Optional[StringTableInf
return GenericTableInfo(
source_tables=[t for t in tables if t], target_tables=[]
)
- except Exception:
- logger.info(f"Could not parse query via spark plan, {query}", exc_info=True)
+ except Exception as e:
+ logger.info(f"Could not parse query via spark plan, {query}: {e!r}")
return None
@staticmethod
diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
index e5a9954802019c..534cac5cef2aa0 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
@@ -7,7 +7,6 @@
from collections import defaultdict
from typing import Dict, List, Optional, Set, Tuple, Union
-import pydantic
import pydantic.dataclasses
import sqlglot
import sqlglot.errors
@@ -23,7 +22,7 @@
from datahub.ingestion.api.closeable import Closeable
from datahub.ingestion.graph.client import DataHubGraph
from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
-from datahub.metadata.schema_classes import SchemaMetadataClass
+from datahub.metadata.schema_classes import OperationTypeClass, SchemaMetadataClass
from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict
from datahub.utilities.urns.dataset_urn import DatasetUrn
@@ -34,6 +33,8 @@
# A lightweight table schema: column -> type mapping.
SchemaInfo = Dict[str, str]
+SQL_PARSE_RESULT_CACHE_SIZE = 1000
+
class QueryType(enum.Enum):
CREATE = "CREATE"
@@ -45,6 +46,22 @@ class QueryType(enum.Enum):
UNKNOWN = "UNKNOWN"
+ def to_operation_type(self) -> Optional[str]:
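+ # Map parsed query types to DataHub OperationTypeClass values; SELECTs emit no operation.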
+ if self == QueryType.CREATE:
+ return OperationTypeClass.CREATE
+ elif self == QueryType.INSERT:
+ return OperationTypeClass.INSERT
+ elif self == QueryType.UPDATE:
+ return OperationTypeClass.UPDATE
+ elif self == QueryType.DELETE:
+ return OperationTypeClass.DELETE
+ elif self == QueryType.MERGE:
+ return OperationTypeClass.UPDATE
+ elif self == QueryType.SELECT:
+ return None
+ else:
+ return OperationTypeClass.UNKNOWN
+
def get_query_type_of_sql(expression: sqlglot.exp.Expression) -> QueryType:
# UPGRADE: Once we use Python 3.10, replace this with a match expression.
@@ -623,16 +640,21 @@ def _translate_internal_column_lineage(
)
+def _get_dialect(platform: str) -> str:
+ # TODO: convert datahub platform names to sqlglot dialect
+ if platform == "presto-on-hive":
+ return "hive"
+ else:
+ return platform
+
+
def _sqlglot_lineage_inner(
sql: str,
schema_resolver: SchemaResolver,
default_db: Optional[str] = None,
default_schema: Optional[str] = None,
) -> SqlParsingResult:
- # TODO: convert datahub platform names to sqlglot dialect
- # TODO: Pull the platform name from the schema resolver?
- dialect = schema_resolver.platform
-
+ dialect = _get_dialect(schema_resolver.platform)
if dialect == "snowflake":
# in snowflake, table identifiers must be uppercased to match sqlglot's behavior.
if default_db:
@@ -755,6 +777,7 @@ def _sqlglot_lineage_inner(
)
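+# Cache parse results so identical queries are parsed only once.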
+@functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE)
def sqlglot_lineage(
sql: str,
schema_resolver: SchemaResolver,
@@ -825,3 +848,43 @@ def sqlglot_lineage(
table_error=e,
),
)
+
+
+def create_lineage_sql_parsed_result(
+ query: str,
+ database: Optional[str],
+ platform: str,
+ platform_instance: Optional[str],
+ env: str,
+ schema: Optional[str] = None,
+ graph: Optional[DataHubGraph] = None,
+) -> Optional["SqlParsingResult"]:
+
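+ # Use a graph-backed SchemaResolver when a DataHub graph is available; otherwise resolve without DataHub schema info.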
+ parsed_result: Optional["SqlParsingResult"] = None
+ try:
+ schema_resolver = (
+ graph._make_schema_resolver(
+ platform=platform,
+ platform_instance=platform_instance,
+ env=env,
+ )
+ if graph is not None
+ else SchemaResolver(
+ platform=platform,
+ platform_instance=platform_instance,
+ env=env,
+ graph=None,
+ )
+ )
+
+ parsed_result = sqlglot_lineage(
+ query,
+ schema_resolver=schema_resolver,
+ default_db=database,
+ default_schema=schema,
+ )
+ except Exception as e:
+ logger.debug(f"Fail to prase query {query}", exc_info=e)
+ logger.warning("Fail to parse custom SQL")
+
+ return parsed_result
diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py
index 3bda6c5cce84b6..cc3ee1f6ceaa47 100644
--- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py
+++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py
@@ -61,6 +61,7 @@ def test_bigquery_v2_ingest(
"project_ids": ["project-id-1"],
"include_usage_statistics": False,
"include_table_lineage": False,
+ "include_data_platform_instance": True,
}
pipeline_config_dict: Dict[str, Any] = {
diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json
index 6167c63e6c9b85..dee85b40bb7a81 100644
--- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json
+++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json
@@ -262,8 +262,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)",
"type": "VIEW"
@@ -412,8 +412,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json
index e66ec4bb89d8c7..72db36e63daf77 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json
@@ -206,8 +206,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json
index 11e0760decae3f..e5508bdb06b9e0 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json
@@ -206,8 +206,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,datahub-demo.view.faa_flights,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json
index ddfd102cb15b0d..91e13debfa0283 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json
@@ -279,8 +279,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)",
"type": "VIEW"
@@ -429,8 +429,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json
index 54624986216b88..e93079119e4f49 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json
@@ -206,8 +206,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json
index 6cab0db8c33cf1..a9c8efa7cdb980 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json
@@ -206,32 +206,32 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view,PROD)",
"type": "VIEW"
},
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view_original_name,PROD)",
"type": "VIEW"
},
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view_has_no_fields,PROD)",
"type": "VIEW"
},
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json
index 9a088a7a8baefc..edd15624a14cd4 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json
@@ -206,24 +206,24 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view_original_name,PROD)",
"type": "VIEW"
},
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)",
"type": "VIEW"
},
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view_has_no_fields,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json
index f8e2565e492e15..aebc89b609a08b 100644
--- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json
+++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json
@@ -206,8 +206,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json
index 32d4f7bc64ab4d..34bded3cf691e5 100644
--- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json
+++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json
@@ -158,8 +158,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/expected_output.json b/metadata-ingestion/tests/integration/lookml/expected_output.json
index cdf520cc23a30e..b53d5857f1d66b 100644
--- a/metadata-ingestion/tests/integration/lookml/expected_output.json
+++ b/metadata-ingestion/tests/integration/lookml/expected_output.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)",
"type": "VIEW"
@@ -260,8 +260,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)",
"type": "VIEW"
@@ -478,8 +478,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -585,8 +585,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)",
"type": "VIEW"
@@ -692,8 +692,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)",
"type": "VIEW"
@@ -844,8 +844,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)",
"type": "VIEW"
@@ -951,8 +951,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -1058,8 +1058,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)",
"type": "VIEW"
@@ -1240,8 +1240,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)",
"type": "VIEW"
@@ -1347,8 +1347,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)",
"type": "VIEW"
@@ -1533,8 +1533,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)",
"type": "VIEW"
@@ -1732,8 +1732,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)",
"type": "VIEW"
@@ -1971,8 +1971,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json
index 73edecbe622054..238f4c2580cdf2 100644
--- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json
+++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD)",
"type": "VIEW"
@@ -260,8 +260,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)",
"type": "VIEW"
@@ -478,8 +478,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -585,8 +585,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.events,PROD)",
"type": "VIEW"
@@ -692,8 +692,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.events,PROD)",
"type": "VIEW"
@@ -844,8 +844,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.autodetect_sql_name_based_on_view_name,PROD)",
"type": "VIEW"
@@ -951,8 +951,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -1058,8 +1058,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.fragment_derived_view,PROD)",
"type": "VIEW"
@@ -1240,8 +1240,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.order,PROD)",
"type": "VIEW"
@@ -1347,8 +1347,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.ecommerce.ability,PROD)",
"type": "VIEW"
@@ -1533,8 +1533,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.owners,PROD)",
"type": "VIEW"
@@ -1732,8 +1732,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)",
"type": "VIEW"
@@ -1971,8 +1971,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.flightstats.accidents,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json
index 9aa6a952c40b4e..45d5d839e9d21c 100644
--- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json
+++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD)",
"type": "VIEW"
@@ -260,8 +260,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)",
"type": "VIEW"
@@ -478,8 +478,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -585,8 +585,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.events,PROD)",
"type": "VIEW"
@@ -692,8 +692,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.events,PROD)",
"type": "VIEW"
@@ -844,8 +844,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.autodetect_sql_name_based_on_view_name,PROD)",
"type": "VIEW"
@@ -951,8 +951,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -1058,8 +1058,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.fragment_derived_view,PROD)",
"type": "VIEW"
@@ -1240,8 +1240,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.order,PROD)",
"type": "VIEW"
@@ -1347,8 +1347,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,ecommerce.ability,PROD)",
"type": "VIEW"
@@ -1533,8 +1533,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.owners,PROD)",
"type": "VIEW"
@@ -1732,8 +1732,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)",
"type": "VIEW"
@@ -1971,8 +1971,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,flightstats.accidents,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json
index 6ce6d809ae8f52..187cedaefb6b21 100644
--- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json
+++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json
@@ -450,8 +450,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -557,8 +557,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)",
"type": "VIEW"
@@ -664,8 +664,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)",
"type": "VIEW"
@@ -816,8 +816,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)",
"type": "VIEW"
@@ -923,8 +923,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -1123,8 +1123,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)",
"type": "VIEW"
@@ -1230,8 +1230,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)",
"type": "VIEW"
@@ -1416,8 +1416,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)",
"type": "VIEW"
@@ -1615,8 +1615,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)",
"type": "VIEW"
@@ -1854,8 +1854,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json
index 1016d4e2114589..a3231186669408 100644
--- a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json
+++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)",
"type": "VIEW"
@@ -260,8 +260,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json
index fc91c97a530037..c2c879e38f37bb 100644
--- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json
+++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)",
"type": "VIEW"
@@ -260,8 +260,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)",
"type": "VIEW"
@@ -478,8 +478,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -585,8 +585,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)",
"type": "VIEW"
@@ -692,8 +692,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)",
"type": "VIEW"
@@ -844,8 +844,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)",
"type": "VIEW"
@@ -951,8 +951,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -1058,8 +1058,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)",
"type": "VIEW"
@@ -1240,8 +1240,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)",
"type": "VIEW"
@@ -1347,8 +1347,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)",
"type": "VIEW"
@@ -1533,8 +1533,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)",
"type": "VIEW"
@@ -1732,8 +1732,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)",
"type": "VIEW"
@@ -1971,8 +1971,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json
index 8635a570c06210..c1ac54b0fb588d 100644
--- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json
+++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -128,8 +128,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)",
"type": "VIEW"
@@ -235,8 +235,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)",
"type": "VIEW"
@@ -387,8 +387,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)",
"type": "VIEW"
@@ -494,8 +494,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json
index 19168aa3231429..f602ca37b31607 100644
--- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json
+++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)",
"type": "VIEW"
@@ -260,8 +260,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)",
"type": "VIEW"
@@ -478,8 +478,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.include_able,DEV)",
"type": "VIEW"
@@ -585,8 +585,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.events,DEV)",
"type": "VIEW"
@@ -692,8 +692,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.events,DEV)",
"type": "VIEW"
@@ -844,8 +844,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.autodetect_sql_name_based_on_view_name,DEV)",
"type": "VIEW"
@@ -951,8 +951,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.include_able,DEV)",
"type": "VIEW"
@@ -1058,8 +1058,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.fragment_derived_view,DEV)",
"type": "VIEW"
@@ -1240,8 +1240,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.order,DEV)",
"type": "VIEW"
@@ -1347,8 +1347,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.ecommerce.ability,DEV)",
"type": "VIEW"
@@ -1533,8 +1533,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.owners,DEV)",
"type": "VIEW"
@@ -1732,8 +1732,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)",
"type": "VIEW"
@@ -1971,8 +1971,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.flightstats.accidents,DEV)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json
index d4ced76a7475d4..104bd365669e34 100644
--- a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json
+++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)",
"type": "VIEW"
@@ -261,8 +261,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)",
"type": "VIEW"
@@ -480,8 +480,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -588,8 +588,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)",
"type": "VIEW"
@@ -696,8 +696,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)",
"type": "VIEW"
@@ -849,8 +849,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)",
"type": "VIEW"
@@ -957,8 +957,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -1065,8 +1065,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)",
"type": "VIEW"
@@ -1248,8 +1248,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)",
"type": "VIEW"
@@ -1356,8 +1356,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)",
"type": "VIEW"
@@ -1543,8 +1543,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)",
"type": "VIEW"
@@ -1743,8 +1743,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)",
"type": "VIEW"
@@ -1983,8 +1983,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json
index 2bae6452145df3..37a6c94c6952e3 100644
--- a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json
+++ b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)",
"type": "VIEW"
@@ -260,8 +260,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.owners,DEV)",
"type": "VIEW"
@@ -459,8 +459,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json
index a5c316f365d4b6..49831ee554ab16 100644
--- a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json
+++ b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)",
"type": "VIEW"
@@ -303,8 +303,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)",
"type": "VIEW"
@@ -410,8 +410,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.order,PROD)",
"type": "VIEW"
@@ -607,8 +607,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.issue_history,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json
index de303d50e7acdf..dc5e1aa9096f84 100644
--- a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json
+++ b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json
@@ -21,8 +21,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)",
"type": "VIEW"
@@ -260,8 +260,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)",
"type": "VIEW"
@@ -478,8 +478,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -585,8 +585,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)",
"type": "VIEW"
@@ -692,8 +692,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)",
"type": "VIEW"
@@ -844,8 +844,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)",
"type": "VIEW"
@@ -951,8 +951,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)",
"type": "VIEW"
@@ -1058,8 +1058,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)",
"type": "VIEW"
@@ -1240,8 +1240,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)",
"type": "VIEW"
@@ -1347,8 +1347,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)",
"type": "VIEW"
@@ -1533,8 +1533,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)",
"type": "VIEW"
@@ -1764,8 +1764,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)",
"type": "VIEW"
@@ -2003,8 +2003,8 @@
"upstreams": [
{
"auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
+ "time": 1586847600000,
+ "actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)",
"type": "VIEW"
diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py
index 5c9553402a8c47..e77a12aa4088e4 100644
--- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py
+++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py
@@ -1,17 +1,22 @@
import logging
import sys
-from typing import List
+from typing import List, Tuple
import pytest
from lark import Tree
import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes
-from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport
-from datahub.ingestion.source.powerbi.m_query import parser, tree_function
-from datahub.ingestion.source.powerbi.m_query.resolver import (
- DataPlatformTable,
- SupportedDataPlatform,
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.source.powerbi.config import (
+ PowerBiDashboardSourceConfig,
+ PowerBiDashboardSourceReport,
+)
+from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
+ AbstractDataPlatformInstanceResolver,
+ create_dataplatform_instance_resolver,
)
+from datahub.ingestion.source.powerbi.m_query import parser, tree_function
+from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable
M_QUERIES = [
'let\n Source = Snowflake.Databases("bu10758.ap-unknown-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table',
@@ -38,9 +43,31 @@
'let\n Source = AmazonRedshift.Database("redshift-url","dev"),\n public = Source{[Name="public"]}[Data],\n category1 = public{[Name="category"]}[Data]\nin\n category1',
'let\n Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) \n in Source',
'let\n Source = Databricks.Catalogs("adb-123.azuredatabricks.net", "/sql/1.0/endpoints/12345dc91aa25844", [Catalog=null, Database=null]),\n hive_metastore_Database = Source{[Name="hive_metastore",Kind="Database"]}[Data],\n sandbox_revenue_Schema = hive_metastore_Database{[Name="sandbox_revenue",Kind="Schema"]}[Data],\n public_consumer_price_index_Table = sandbox_revenue_Schema{[Name="public_consumer_price_index",Kind="Table"]}[Data],\n #"Renamed Columns" = Table.RenameColumns(public_consumer_price_index_Table,{{"Country", "country"}, {"Metric", "metric"}}),\n #"Inserted Year" = Table.AddColumn(#"Renamed Columns", "ID", each Date.Year([date_id]) + Date.Month([date_id]), Text.Type),\n #"Added Custom" = Table.AddColumn(#"Inserted Year", "Custom", each Text.Combine({Number.ToText(Date.Year([date_id])), Number.ToText(Date.Month([date_id])), [country]})),\n #"Removed Columns" = Table.RemoveColumns(#"Added Custom",{"ID"}),\n #"Renamed Columns1" = Table.RenameColumns(#"Removed Columns",{{"Custom", "ID"}}),\n #"Filtered Rows" = Table.SelectRows(#"Renamed Columns1", each ([metric] = "Consumer Price Index") and (not Number.IsNaN([value])))\nin\n #"Filtered Rows"',
+ "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu10758.ap-unknown-2.fakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS inner join OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT #(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source",
]
+def get_default_instances(
+ override_config: dict = {},
+) -> Tuple[
+ PipelineContext, PowerBiDashboardSourceConfig, AbstractDataPlatformInstanceResolver
+]:
+ config: PowerBiDashboardSourceConfig = PowerBiDashboardSourceConfig.parse_obj(
+ {
+ "tenant_id": "fake",
+ "client_id": "foo",
+ "client_secret": "bar",
+ **override_config,
+ }
+ )
+
+ platform_instance_resolver: AbstractDataPlatformInstanceResolver = (
+ create_dataplatform_instance_resolver(config)
+ )
+
+ return PipelineContext(run_id="fake"), config, platform_instance_resolver
+
+
@pytest.mark.integration
def test_parse_m_query1():
expression: str = M_QUERIES[0]
@@ -145,20 +172,20 @@ def test_snowflake_regular_case():
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == "TESTTABLE"
- assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE"
assert (
- data_platform_tables[0].datasource_server
- == "bu10758.ap-unknown-2.fakecomputing.com"
- )
- assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:snowflake,pbi_test.test.testtable,PROD)"
)
@@ -174,17 +201,21 @@ def test_postgres_regular_case():
)
reporter = PowerBiDashboardSourceReport()
+
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == "order_date"
- assert data_platform_tables[0].full_name == "mics.public.order_date"
- assert data_platform_tables[0].datasource_server == "localhost"
assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)"
)
@@ -200,19 +231,21 @@ def test_databricks_regular_case():
)
reporter = PowerBiDashboardSourceReport()
+
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == "public_consumer_price_index"
assert (
- data_platform_tables[0].full_name
- == "hive_metastore.sandbox_revenue.public_consumer_price_index"
- )
- assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.DATABRICK_SQL.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:databricks,hive_metastore.sandbox_revenue.public_consumer_price_index,PROD)"
)
@@ -228,17 +261,21 @@ def test_oracle_regular_case():
)
reporter = PowerBiDashboardSourceReport()
+
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == "EMPLOYEES"
- assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES"
- assert data_platform_tables[0].datasource_server == "localhost:1521"
assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.ORACLE.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:oracle,salesdb.hr.employees,PROD)"
)
@@ -255,17 +292,20 @@ def test_mssql_regular_case():
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == "book_issue"
- assert data_platform_tables[0].full_name == "library.dbo.book_issue"
- assert data_platform_tables[0].datasource_server == "localhost"
assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:mssql,library.dbo.book_issue,PROD)"
)
@@ -280,14 +320,16 @@ def test_mssql_with_query():
M_QUERIES[11],
]
expected_tables = [
- "COMMOPSDB.dbo.V_OIP_ENT_2022",
- "COMMOPSDB.dbo.V_INVOICE_BOOKING_2022",
- "COMMOPSDB.dbo.V_ARR_ADDS",
- "COMMOPSDB.dbo.V_PS_CD_RETENTION",
- "COMMOPSDB.dbo.V_TPV_LEADERBOARD",
- "COMMOPSDB.dbo.V_ENTERPRISE_INVOICED_REVENUE",
+ "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_oip_ent_2022,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_invoice_booking_2022,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_arr_adds,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_tpv_leaderboard,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_enterprise_invoiced_revenue,PROD)",
]
+ ctx, config, platform_instance_resolver = get_default_instances()
+
for index, query in enumerate(mssql_queries):
table: powerbi_data_classes.Table = powerbi_data_classes.Table(
columns=[],
@@ -299,17 +341,15 @@ def test_mssql_with_query():
reporter = PowerBiDashboardSourceReport()
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter, native_query_enabled=False
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == expected_tables[index].split(".")[2]
- assert data_platform_tables[0].full_name == expected_tables[index]
- assert data_platform_tables[0].datasource_server == "AUPRDWHDB"
- assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name
- )
+ assert data_platform_tables[0].urn == expected_tables[index]
@pytest.mark.integration
@@ -322,12 +362,14 @@ def test_snowflake_native_query():
]
expected_tables = [
- "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4",
- "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS",
- "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS",
- "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS",
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)",
]
+ ctx, config, platform_instance_resolver = get_default_instances()
+
for index, query in enumerate(snowflake_queries):
table: powerbi_data_classes.Table = powerbi_data_classes.Table(
columns=[],
@@ -339,20 +381,15 @@ def test_snowflake_native_query():
reporter = PowerBiDashboardSourceReport()
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == expected_tables[index].split(".")[2]
- assert data_platform_tables[0].full_name == expected_tables[index]
- assert (
- data_platform_tables[0].datasource_server
- == "bu10758.ap-unknown-2.fakecomputing.com"
- )
- assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name
- )
+ assert data_platform_tables[0].urn == expected_tables[index]
def test_google_bigquery_1():
@@ -363,16 +400,20 @@ def test_google_bigquery_1():
)
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter, native_query_enabled=False
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
+
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == table.full_name.split(".")[2]
- assert data_platform_tables[0].full_name == table.full_name
- assert data_platform_tables[0].datasource_server == "seraphic-music-344307"
assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:bigquery,seraphic-music-344307.school_dataset.first,PROD)"
)
@@ -387,23 +428,24 @@ def test_google_bigquery_2():
)
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
table,
reporter,
- native_query_enabled=False,
parameters={
"Parameter - Source": "my-test-project",
"My bq project": "gcp_billing",
},
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == table.full_name.split(".")[2]
- assert data_platform_tables[0].full_name == table.full_name
- assert data_platform_tables[0].datasource_server == "my-test-project"
assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.gcp_billing.gcp_table,PROD)"
)
@@ -416,23 +458,24 @@ def test_for_each_expression_1():
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
table,
reporter,
- native_query_enabled=False,
parameters={
"Parameter - Source": "my-test-project",
"My bq project": "gcp_billing",
},
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == table.full_name.split(".")[2]
- assert data_platform_tables[0].datasource_server == "my-test-project"
- assert data_platform_tables[0].full_name == table.full_name
assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.universal.d_wh_date,PROD)"
)
@@ -445,22 +488,23 @@ def test_for_each_expression_2():
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
table,
reporter,
- native_query_enabled=False,
parameters={
"dwh-prod": "originally-not-a-variable-ref-and-not-resolved",
},
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == table.full_name.split(".")[2]
- assert data_platform_tables[0].full_name == table.full_name
- assert data_platform_tables[0].datasource_server == "dwh-prod"
assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:bigquery,dwh-prod.gcp_billing.d_gcp_custom_label,PROD)"
)
@@ -476,8 +520,14 @@ def test_native_query_disabled():
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
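+ # Disable native query parsing; the native-query table should then yield no upstream tables.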
+ config.native_query_parsing = False
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter, native_query_enabled=False
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 0
@@ -493,26 +543,25 @@ def test_multi_source_table():
)
reporter = PowerBiDashboardSourceReport()
+
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter, native_query_enabled=False
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 2
- assert data_platform_tables[0].full_name == "mics.public.order_date"
- assert data_platform_tables[0].datasource_server == "localhost"
- assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name
- )
-
- assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST_VIEW"
assert (
- data_platform_tables[1].datasource_server
- == "ghh48144.snowflakefakecomputing.com"
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)"
)
assert (
- data_platform_tables[1].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name
+ data_platform_tables[1].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst_view,PROD)"
)
@@ -521,36 +570,33 @@ def test_table_combine():
table: powerbi_data_classes.Table = powerbi_data_classes.Table(
columns=[],
measures=[],
- expression=M_QUERIES[16], # 1st index has the native query
+ expression=M_QUERIES[16],
name="virtual_order_table",
full_name="OrderDataSet.virtual_order_table",
)
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 2
- assert data_platform_tables[0].full_name == "GSL_TEST_DB.PUBLIC.SALES_FORECAST"
- assert (
- data_platform_tables[0].datasource_server
- == "ghh48144.snowflakefakecomputing.com"
- )
- assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name
- )
- assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST"
assert (
- data_platform_tables[1].datasource_server
- == "ghh48144.snowflakefakecomputing.com"
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_forecast,PROD)"
)
+
assert (
- data_platform_tables[1].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name
+ data_platform_tables[1].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD)"
)
@@ -574,8 +620,14 @@ def test_expression_is_none():
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
assert len(data_platform_tables) == 0
@@ -589,15 +641,20 @@ def test_redshift_regular_case():
)
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter, native_query_enabled=False
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
+
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == table.full_name.split(".")[2]
- assert data_platform_tables[0].full_name == table.full_name
assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)"
)
@@ -609,13 +666,60 @@ def test_redshift_native_query():
)
reporter = PowerBiDashboardSourceReport()
+ ctx, config, platform_instance_resolver = get_default_instances()
+
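+ # Enable native query parsing so the Redshift native query resolves to an upstream dataset URN.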
+ config.native_query_parsing = True
+
data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
- table, reporter, native_query_enabled=True
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
)
+
assert len(data_platform_tables) == 1
- assert data_platform_tables[0].name == table.full_name.split(".")[2]
- assert data_platform_tables[0].full_name == table.full_name
assert (
- data_platform_tables[0].data_platform_pair.powerbi_data_platform_name
- == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)"
+ )
+
+
+def test_sqlglot_parser():
+ table: powerbi_data_classes.Table = powerbi_data_classes.Table(
+ expression=M_QUERIES[24],
+ name="SALES_TARGET",
+ full_name="dev.public.sales",
+ )
+ reporter = PowerBiDashboardSourceReport()
+
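+ # Map the Snowflake server to the "sales_deployment" platform instance and enable sqlglot-based native query lineage.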
+ ctx, config, platform_instance_resolver = get_default_instances(
+ override_config={
+ "server_to_platform_instance": {
+ "bu10758.ap-unknown-2.fakecomputing.com": {
+ "platform_instance": "sales_deployment",
+ "env": "PROD",
+ }
+ },
+ "native_query_parsing": True,
+ "enable_advance_lineage_sql_construct": True,
+ }
+ )
+
+ data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables(
+ table,
+ reporter,
+ ctx=ctx,
+ config=config,
+ platform_instance_resolver=platform_instance_resolver,
+ )
+
+ assert len(data_platform_tables) == 2
+ assert (
+ data_platform_tables[0].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit,PROD)"
+ )
+ assert (
+ data_platform_tables[1].urn
+ == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit_targets,PROD)"
)
diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py
index 43f5e04fbc89fc..81e307a78ae9e6 100644
--- a/metadata-ingestion/tests/integration/snowflake/common.py
+++ b/metadata-ingestion/tests/integration/snowflake/common.py
@@ -434,11 +434,6 @@ def default_query_results( # noqa: C901
}
for op_idx in range(1, num_ops + 1)
]
- elif query == snowflake_query.SnowflakeQuery.external_table_lineage_history(
- 1654473600000,
- 1654586220000,
- ):
- return []
elif query in [
snowflake_query.SnowflakeQuery.view_dependencies(),
]:
@@ -509,10 +504,6 @@ def default_query_results( # noqa: C901
}
]
elif query in [
- snowflake_query.SnowflakeQuery.external_table_lineage_history(
- 1654473600000,
- 1654586220000,
- ),
snowflake_query.SnowflakeQuery.view_dependencies_v2(),
snowflake_query.SnowflakeQuery.view_dependencies(),
snowflake_query.SnowflakeQuery.show_external_tables(),
diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py
index 53b2bcb236cd9d..6135b0b3b32745 100644
--- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py
+++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py
@@ -121,7 +121,6 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
include_table_lineage=True,
include_view_lineage=True,
include_usage_stats=True,
- use_legacy_lineage_method=False,
validate_upstreams_against_patterns=False,
include_operational_stats=True,
email_as_user_identifier=True,
@@ -213,7 +212,6 @@ def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_
include_column_lineage=False,
include_views=False,
include_view_lineage=False,
- use_legacy_lineage_method=False,
include_usage_stats=False,
include_operational_stats=False,
start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace(
diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py
index 73a261bb3cb6e8..4963e71ae4d96f 100644
--- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py
+++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py
@@ -55,7 +55,6 @@ def snowflake_pipeline_config(tmp_path):
schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]),
include_view_lineage=False,
include_usage_stats=False,
- use_legacy_lineage_method=False,
start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace(
tzinfo=timezone.utc
),
diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py
deleted file mode 100644
index a5993793e574d7..00000000000000
--- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py
+++ /dev/null
@@ -1,291 +0,0 @@
-from datetime import datetime, timezone
-from typing import cast
-from unittest import mock
-
-from freezegun import freeze_time
-from pytest import fixture
-
-from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig
-from datahub.ingestion.run.pipeline import Pipeline
-from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig
-from datahub.ingestion.source.snowflake import snowflake_query
-from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config
-from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
-from tests.integration.snowflake.common import (
- FROZEN_TIME,
- NUM_TABLES,
- default_query_results,
-)
-
-
-def query_permission_error_override(fn, override_for_query, error_msg):
- def my_function(query):
- if query in override_for_query:
- raise Exception(error_msg)
- else:
- return fn(query)
-
- return my_function
-
-
-def query_permission_response_override(fn, override_for_query, response):
- def my_function(query):
- if query in override_for_query:
- return response
- else:
- return fn(query)
-
- return my_function
-
-
-@fixture(scope="function")
-def snowflake_pipeline_legacy_lineage_config(tmp_path):
- output_file = tmp_path / "snowflake_test_events_permission_error.json"
- config = PipelineConfig(
- source=SourceConfig(
- type="snowflake",
- config=SnowflakeV2Config(
- account_id="ABC12345.ap-south-1.aws",
- username="TST_USR",
- password="TST_PWD",
- role="TEST_ROLE",
- warehouse="TEST_WAREHOUSE",
- include_technical_schema=True,
- match_fully_qualified_names=True,
- schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]),
- include_view_lineage=False,
- include_usage_stats=False,
- use_legacy_lineage_method=True,
- start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace(
- tzinfo=timezone.utc
- ),
- end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(tzinfo=timezone.utc),
- ),
- ),
- sink=DynamicTypedConfig(type="file", config={"filename": str(output_file)}),
- )
- return config
-
-
-@freeze_time(FROZEN_TIME)
-def test_snowflake_missing_role_access_causes_pipeline_failure(
- pytestconfig,
- snowflake_pipeline_legacy_lineage_config,
-):
- with mock.patch("snowflake.connector.connect") as mock_connect:
- # Snowflake connection fails role not granted error
- mock_connect.side_effect = Exception(
- "250001 (08001): Failed to connect to DB: abc12345.ap-south-1.snowflakecomputing.com:443. Role 'TEST_ROLE' specified in the connect string is not granted to this user. Contact your local system administrator, or attempt to login with another role, e.g. PUBLIC"
- )
-
- pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config)
- pipeline.run()
- assert "permission-error" in pipeline.source.get_report().failures.keys()
-
-
-@freeze_time(FROZEN_TIME)
-def test_snowflake_missing_warehouse_access_causes_pipeline_failure(
- pytestconfig,
- snowflake_pipeline_legacy_lineage_config,
-):
- with mock.patch("snowflake.connector.connect") as mock_connect:
- sf_connection = mock.MagicMock()
- sf_cursor = mock.MagicMock()
- mock_connect.return_value = sf_connection
- sf_connection.cursor.return_value = sf_cursor
-
- # Current warehouse query leads to blank result
- sf_cursor.execute.side_effect = query_permission_response_override(
- default_query_results,
- [SnowflakeQuery.current_warehouse()],
- [(None,)],
- )
- pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config)
- pipeline.run()
- assert "permission-error" in pipeline.source.get_report().failures.keys()
-
-
-@freeze_time(FROZEN_TIME)
-def test_snowflake_no_databases_with_access_causes_pipeline_failure(
- pytestconfig,
- snowflake_pipeline_legacy_lineage_config,
-):
- with mock.patch("snowflake.connector.connect") as mock_connect:
- sf_connection = mock.MagicMock()
- sf_cursor = mock.MagicMock()
- mock_connect.return_value = sf_connection
- sf_connection.cursor.return_value = sf_cursor
-
- # Error in listing databases
- sf_cursor.execute.side_effect = query_permission_error_override(
- default_query_results,
- [SnowflakeQuery.get_databases("TEST_DB")],
- "Database 'TEST_DB' does not exist or not authorized.",
- )
- pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config)
- pipeline.run()
- assert "permission-error" in pipeline.source.get_report().failures.keys()
-
-
-@freeze_time(FROZEN_TIME)
-def test_snowflake_no_tables_causes_pipeline_failure(
- pytestconfig,
- snowflake_pipeline_legacy_lineage_config,
-):
- with mock.patch("snowflake.connector.connect") as mock_connect:
- sf_connection = mock.MagicMock()
- sf_cursor = mock.MagicMock()
- mock_connect.return_value = sf_connection
- sf_connection.cursor.return_value = sf_cursor
-
- # Error in listing databases
- no_tables_fn = query_permission_response_override(
- default_query_results,
- [SnowflakeQuery.tables_for_schema("TEST_SCHEMA", "TEST_DB")],
- [],
- )
- sf_cursor.execute.side_effect = query_permission_response_override(
- no_tables_fn,
- [SnowflakeQuery.show_views_for_schema("TEST_SCHEMA", "TEST_DB")],
- [],
- )
-
- pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config)
- pipeline.run()
- assert "permission-error" in pipeline.source.get_report().failures.keys()
-
-
-@freeze_time(FROZEN_TIME)
-def test_snowflake_list_columns_error_causes_pipeline_warning(
- pytestconfig,
- snowflake_pipeline_legacy_lineage_config,
-):
- with mock.patch("snowflake.connector.connect") as mock_connect:
- sf_connection = mock.MagicMock()
- sf_cursor = mock.MagicMock()
- mock_connect.return_value = sf_connection
- sf_connection.cursor.return_value = sf_cursor
-
- # Error in listing columns
- sf_cursor.execute.side_effect = query_permission_error_override(
- default_query_results,
- [
- SnowflakeQuery.columns_for_table(
- "TABLE_{}".format(tbl_idx), "TEST_SCHEMA", "TEST_DB"
- )
- for tbl_idx in range(1, NUM_TABLES + 1)
- ],
- "Database 'TEST_DB' does not exist or not authorized.",
- )
- pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config)
- pipeline.run()
- pipeline.raise_from_status() # pipeline should not fail
- assert (
- "Failed to get columns for table"
- in pipeline.source.get_report().warnings.keys()
- )
-
-
-@freeze_time(FROZEN_TIME)
-def test_snowflake_list_primary_keys_error_causes_pipeline_warning(
- pytestconfig,
- snowflake_pipeline_legacy_lineage_config,
-):
- with mock.patch("snowflake.connector.connect") as mock_connect:
- sf_connection = mock.MagicMock()
- sf_cursor = mock.MagicMock()
- mock_connect.return_value = sf_connection
- sf_connection.cursor.return_value = sf_cursor
-
- # Error in listing keys leads to warning
- sf_cursor.execute.side_effect = query_permission_error_override(
- default_query_results,
- [SnowflakeQuery.show_primary_keys_for_schema("TEST_SCHEMA", "TEST_DB")],
- "Insufficient privileges to operate on TEST_DB",
- )
- pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config)
- pipeline.run()
- pipeline.raise_from_status() # pipeline should not fail
- assert (
- "Failed to get primary key for table"
- in pipeline.source.get_report().warnings.keys()
- )
-
-
-@freeze_time(FROZEN_TIME)
-def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure(
- pytestconfig,
- snowflake_pipeline_legacy_lineage_config,
-):
- with mock.patch("snowflake.connector.connect") as mock_connect:
- sf_connection = mock.MagicMock()
- sf_cursor = mock.MagicMock()
- mock_connect.return_value = sf_connection
- sf_connection.cursor.return_value = sf_cursor
-
- # Error in getting lineage
- sf_cursor.execute.side_effect = query_permission_error_override(
- default_query_results,
- [
- snowflake_query.SnowflakeQuery.table_to_table_lineage_history(
- 1654473600000, 1654586220000, True
- ),
- ],
- "Database 'SNOWFLAKE' does not exist or not authorized.",
- )
- pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config)
- pipeline.run()
- assert (
- "lineage-permission-error" in pipeline.source.get_report().failures.keys()
- )
-
-
-@freeze_time(FROZEN_TIME)
-def test_snowflake_missing_snowflake_operations_permission_causes_pipeline_failure(
- pytestconfig,
- snowflake_pipeline_legacy_lineage_config,
-):
- with mock.patch("snowflake.connector.connect") as mock_connect:
- sf_connection = mock.MagicMock()
- sf_cursor = mock.MagicMock()
- mock_connect.return_value = sf_connection
- sf_connection.cursor.return_value = sf_cursor
-
- # Error in getting access history date range
- sf_cursor.execute.side_effect = query_permission_error_override(
- default_query_results,
- [snowflake_query.SnowflakeQuery.get_access_history_date_range()],
- "Database 'SNOWFLAKE' does not exist or not authorized.",
- )
- pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config)
- pipeline.run()
- assert "usage-permission-error" in pipeline.source.get_report().failures.keys()
-
-
-@freeze_time(FROZEN_TIME)
-def test_snowflake_unexpected_snowflake_view_lineage_error_causes_pipeline_warning(
- pytestconfig,
- snowflake_pipeline_legacy_lineage_config,
-):
- with mock.patch("snowflake.connector.connect") as mock_connect:
- sf_connection = mock.MagicMock()
- sf_cursor = mock.MagicMock()
- mock_connect.return_value = sf_connection
- sf_connection.cursor.return_value = sf_cursor
-
- # Error in getting view lineage
- sf_cursor.execute.side_effect = query_permission_error_override(
- default_query_results,
- [snowflake_query.SnowflakeQuery.view_dependencies()],
- "Unexpected Error",
- )
-
- snowflake_pipeline_config1 = snowflake_pipeline_legacy_lineage_config.copy()
- cast(
- SnowflakeV2Config,
- cast(PipelineConfig, snowflake_pipeline_config1).source.config,
- ).include_view_lineage = True
- pipeline = Pipeline(snowflake_pipeline_config1)
- pipeline.run()
- pipeline.raise_from_status() # pipeline should not fail
- assert "view-upstream-lineage" in pipeline.source.get_report().warnings.keys()
diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py
deleted file mode 100644
index 59da7ddf695d8f..00000000000000
--- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py
+++ /dev/null
@@ -1,207 +0,0 @@
-import random
-from datetime import datetime, timezone
-from unittest import mock
-
-import pandas as pd
-import pytest
-from freezegun import freeze_time
-
-from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig
-from datahub.ingestion.glossary.classifier import (
- ClassificationConfig,
- DynamicTypedClassifierConfig,
-)
-from datahub.ingestion.glossary.datahub_classifier import (
- DataHubClassifierConfig,
- InfoTypeConfig,
- PredictionFactorsAndWeights,
- ValuesFactorConfig,
-)
-from datahub.ingestion.run.pipeline import Pipeline
-from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig
-from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
-from datahub.ingestion.source.snowflake.snowflake_config import (
- SnowflakeV2Config,
- TagOption,
-)
-from tests.integration.snowflake.common import FROZEN_TIME, default_query_results
-from tests.integration.snowflake.test_snowflake import random_cloud_region, random_email
-from tests.test_helpers import mce_helpers
-
-
-@pytest.mark.integration
-def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
- test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake"
-
- # Run the metadata ingestion pipeline.
- output_file = tmp_path / "snowflake_test_events.json"
- golden_file = test_resources_dir / "snowflake_golden.json"
-
- with mock.patch("snowflake.connector.connect") as mock_connect, mock.patch(
- "datahub.ingestion.source.snowflake.snowflake_v2.SnowflakeV2Source.get_sample_values_for_table"
- ) as mock_sample_values:
- sf_connection = mock.MagicMock()
- sf_cursor = mock.MagicMock()
- mock_connect.return_value = sf_connection
- sf_connection.cursor.return_value = sf_cursor
-
- sf_cursor.execute.side_effect = default_query_results
-
- mock_sample_values.return_value = pd.DataFrame(
- data={
- "col_1": [random.randint(1, 80) for i in range(20)],
- "col_2": [random_email() for i in range(20)],
- "col_3": [random_cloud_region() for i in range(20)],
- }
- )
-
- datahub_classifier_config = DataHubClassifierConfig(
- minimum_values_threshold=10,
- confidence_level_threshold=0.58,
- info_types_config={
- "Age": InfoTypeConfig(
- Prediction_Factors_and_Weights=PredictionFactorsAndWeights(
- Name=0, Values=1, Description=0, Datatype=0
- )
- ),
- "CloudRegion": InfoTypeConfig(
- Prediction_Factors_and_Weights=PredictionFactorsAndWeights(
- Name=0,
- Description=0,
- Datatype=0,
- Values=1,
- ),
- Values=ValuesFactorConfig(
- prediction_type="regex",
- regex=[
- r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+"
- ],
- ),
- ),
- },
- )
-
- pipeline = Pipeline(
- config=PipelineConfig(
- source=SourceConfig(
- type="snowflake",
- config=SnowflakeV2Config(
- account_id="ABC12345.ap-south-1.aws",
- username="TST_USR",
- password="TST_PWD",
- match_fully_qualified_names=True,
- schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]),
- include_technical_schema=True,
- include_table_lineage=True,
- include_view_lineage=True,
- include_usage_stats=True,
- use_legacy_lineage_method=True,
- validate_upstreams_against_patterns=False,
- include_operational_stats=True,
- start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace(
- tzinfo=timezone.utc
- ),
- end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(
- tzinfo=timezone.utc
- ),
- classification=ClassificationConfig(
- enabled=True,
- classifiers=[
- DynamicTypedClassifierConfig(
- type="datahub", config=datahub_classifier_config
- )
- ],
- ),
- profiling=GEProfilingConfig(
- enabled=True,
- profile_if_updated_since_days=None,
- profile_table_row_limit=None,
- profile_table_size_limit=None,
- profile_table_level_only=True,
- ),
- extract_tags=TagOption.without_lineage,
- ),
- ),
- sink=DynamicTypedConfig(
- type="file", config={"filename": str(output_file)}
- ),
- )
- )
- pipeline.run()
- pipeline.pretty_print_summary()
- pipeline.raise_from_status()
-
- # Verify the output.
-
- mce_helpers.check_golden_file(
- pytestconfig,
- output_path=output_file,
- golden_path=golden_file,
- ignore_paths=[
- r"root\[\d+\]\['aspect'\]\['json'\]\['timestampMillis'\]",
- r"root\[\d+\]\['aspect'\]\['json'\]\['created'\]",
- r"root\[\d+\]\['aspect'\]\['json'\]\['lastModified'\]",
- r"root\[\d+\]\['aspect'\]\['json'\]\['fields'\]\[\d+\]\['glossaryTerms'\]\['auditStamp'\]\['time'\]",
- r"root\[\d+\]\['systemMetadata'\]",
- ],
- )
-
-
-@freeze_time(FROZEN_TIME)
-@pytest.mark.integration
-def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
- test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake"
-
- # Run the metadata ingestion pipeline.
- output_file = tmp_path / "snowflake_privatelink_test_events.json"
- golden_file = test_resources_dir / "snowflake_privatelink_golden.json"
-
- with mock.patch("snowflake.connector.connect") as mock_connect:
- sf_connection = mock.MagicMock()
- sf_cursor = mock.MagicMock()
- mock_connect.return_value = sf_connection
- sf_connection.cursor.return_value = sf_cursor
- sf_cursor.execute.side_effect = default_query_results
-
- pipeline = Pipeline(
- config=PipelineConfig(
- source=SourceConfig(
- type="snowflake",
- config=SnowflakeV2Config(
- account_id="ABC12345.ap-south-1.privatelink",
- username="TST_USR",
- password="TST_PWD",
- schema_pattern=AllowDenyPattern(allow=["test_schema"]),
- include_technical_schema=True,
- include_table_lineage=True,
- include_column_lineage=False,
- include_views=False,
- include_view_lineage=False,
- use_legacy_lineage_method=True,
- include_usage_stats=False,
- include_operational_stats=False,
- start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace(
- tzinfo=timezone.utc
- ),
- end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(
- tzinfo=timezone.utc
- ),
- ),
- ),
- sink=DynamicTypedConfig(
- type="file", config={"filename": str(output_file)}
- ),
- )
- )
- pipeline.run()
- pipeline.pretty_print_summary()
- pipeline.raise_from_status()
-
- # Verify the output.
-
- mce_helpers.check_golden_file(
- pytestconfig,
- output_path=output_file,
- golden_path=golden_file,
- ignore_paths=[],
- )
diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json
index be4ae9e047aea0..67a563baa561cd 100644
--- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json
+++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json
@@ -66,6 +66,70 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataFlowInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "",
+ "name": "localhost.Weekly Demo Data Backup"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362",
+ "job_name": "Weekly Demo Data Backup",
+ "description": "No description available.",
+ "date_created": "2023-03-10 16:27:54.970000",
+ "date_modified": "2023-03-10 16:27:55.097000",
+ "step_id": "1",
+ "step_name": "Set database to read only",
+ "subsystem": "TSQL",
+ "command": "ALTER DATABASE DemoData SET READ_ONLY"
+ },
+ "externalUrl": "",
+ "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup",
+ "type": {
+ "string": "MSSQL_JOB_STEP"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInputOutput",
+ "aspect": {
+ "json": {
+ "inputDatasets": [],
+ "outputDatasets": [],
+ "inputDatajobs": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "container",
"entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5",
@@ -1740,6 +1804,68 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataFlowInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "",
+ "name": "localhost.demodata.Foo.stored_procedures"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "procedure_depends_on": "{}",
+ "depending_on_procedure": "{}",
+ "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n",
+ "input parameters": "['@ID']",
+ "parameter @ID": "{'type': 'int'}",
+ "date_created": "2023-03-10 16:27:54.907000",
+ "date_modified": "2023-03-10 16:27:54.907000"
+ },
+ "externalUrl": "",
+ "name": "demodata.Foo.DBs",
+ "type": {
+ "string": "MSSQL_STORED_PROCEDURE"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInputOutput",
+ "aspect": {
+ "json": {
+ "inputDatasets": [],
+ "outputDatasets": [],
+ "inputDatajobs": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)",
@@ -3985,6 +4111,66 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "container",
"entityUrn": "urn:li:container:c6627af82d44de89492e1a9315ae9f4b",
diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json
index bc81ce96334325..ef6033dd919435 100644
--- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json
+++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json
@@ -66,6 +66,70 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataFlowInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "",
+ "name": "localhost.Weekly Demo Data Backup"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362",
+ "job_name": "Weekly Demo Data Backup",
+ "description": "No description available.",
+ "date_created": "2023-03-10 16:27:54.970000",
+ "date_modified": "2023-03-10 16:27:55.097000",
+ "step_id": "1",
+ "step_name": "Set database to read only",
+ "subsystem": "TSQL",
+ "command": "ALTER DATABASE DemoData SET READ_ONLY"
+ },
+ "externalUrl": "",
+ "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup",
+ "type": {
+ "string": "MSSQL_JOB_STEP"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInputOutput",
+ "aspect": {
+ "json": {
+ "inputDatasets": [],
+ "outputDatasets": [],
+ "inputDatajobs": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "container",
"entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5",
@@ -1740,6 +1804,68 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataFlowInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "",
+ "name": "localhost.demodata.Foo.stored_procedures"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "procedure_depends_on": "{}",
+ "depending_on_procedure": "{}",
+ "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n",
+ "input parameters": "['@ID']",
+ "parameter @ID": "{'type': 'int'}",
+ "date_created": "2023-03-10 16:27:54.907000",
+ "date_modified": "2023-03-10 16:27:54.907000"
+ },
+ "externalUrl": "",
+ "name": "demodata.Foo.DBs",
+ "type": {
+ "string": "MSSQL_STORED_PROCEDURE"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInputOutput",
+ "aspect": {
+ "json": {
+ "inputDatasets": [],
+ "outputDatasets": [],
+ "inputDatajobs": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)",
@@ -2053,6 +2179,66 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "container",
"entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f",
diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json
index 8be2fe134dca1a..8098accebb424c 100644
--- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json
+++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json
@@ -66,6 +66,70 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataFlowInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "",
+ "name": "localhost.Weekly Demo Data Backup"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362",
+ "job_name": "Weekly Demo Data Backup",
+ "description": "No description available.",
+ "date_created": "2023-03-10 16:27:54.970000",
+ "date_modified": "2023-03-10 16:27:55.097000",
+ "step_id": "1",
+ "step_name": "Set database to read only",
+ "subsystem": "TSQL",
+ "command": "ALTER DATABASE DemoData SET READ_ONLY"
+ },
+ "externalUrl": "",
+ "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup",
+ "type": {
+ "string": "MSSQL_JOB_STEP"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInputOutput",
+ "aspect": {
+ "json": {
+ "inputDatasets": [],
+ "outputDatasets": [],
+ "inputDatajobs": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "container",
"entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5",
@@ -1740,6 +1804,68 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataFlowInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "",
+ "name": "localhost.demodata.Foo.stored_procedures"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "procedure_depends_on": "{}",
+ "depending_on_procedure": "{}",
+ "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n",
+ "input parameters": "['@ID']",
+ "parameter @ID": "{'type': 'int'}",
+ "date_created": "2023-03-10 16:27:54.907000",
+ "date_modified": "2023-03-10 16:27:54.907000"
+ },
+ "externalUrl": "",
+ "name": "demodata.Foo.DBs",
+ "type": {
+ "string": "MSSQL_STORED_PROCEDURE"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInputOutput",
+ "aspect": {
+ "json": {
+ "inputDatasets": [],
+ "outputDatasets": [],
+ "inputDatajobs": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoDataAlias.Foo.SalesReason,PROD)",
@@ -2053,6 +2179,66 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "container",
"entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f",
diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json
index ba2ab7330fdeda..d32002fb5648cc 100644
--- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json
+++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json
@@ -81,6 +81,70 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataFlowInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "",
+ "name": "localhost.Weekly Demo Data Backup"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "job_id": "b6a0c1e2-f90a-4c86-a226-bf7ca59ad79f",
+ "job_name": "Weekly Demo Data Backup",
+ "description": "No description available.",
+ "date_created": "2023-08-06 21:01:05.157000",
+ "date_modified": "2023-08-06 21:01:05.283000",
+ "step_id": "1",
+ "step_name": "Set database to read only",
+ "subsystem": "TSQL",
+ "command": "ALTER DATABASE DemoData SET READ_ONLY"
+ },
+ "externalUrl": "",
+ "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup",
+ "type": {
+ "string": "MSSQL_JOB_STEP"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInputOutput",
+ "aspect": {
+ "json": {
+ "inputDatasets": [],
+ "outputDatasets": [],
+ "inputDatajobs": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "container",
"entityUrn": "urn:li:container:f1b4c0e379c4b2e2e09a8ecd6c1b6dec",
@@ -1764,6 +1828,68 @@
"runId": "mssql-test"
}
},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataFlowInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "externalUrl": "",
+ "name": "localhost.demodata.Foo.stored_procedures"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "procedure_depends_on": "{}",
+ "depending_on_procedure": "{}",
+ "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n",
+ "input parameters": "['@ID']",
+ "parameter @ID": "{'type': 'int'}",
+ "date_created": "2023-08-06 21:01:05.093000",
+ "date_modified": "2023-08-06 21:01:05.093000"
+ },
+ "externalUrl": "",
+ "name": "demodata.Foo.DBs",
+ "type": {
+ "string": "MSSQL_STORED_PROCEDURE"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInputOutput",
+ "aspect": {
+ "json": {
+ "inputDatasets": [],
+ "outputDatasets": [],
+ "inputDatajobs": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
{
"entityType": "container",
"entityUrn": "urn:li:container:a6bea84fba7b05fb5d12630c8e6306ac",
@@ -2072,5 +2198,65 @@
"lastObserved": 1615443388097,
"runId": "mssql-test"
}
+},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1615443388097,
+ "runId": "mssql-test"
+ }
}
]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql
index 612de3eb1583ce..2ff46e249007a6 100644
--- a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql
+++ b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql
@@ -44,6 +44,10 @@ CREATE TABLE Foo.SalesReason
)
;
GO
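+-- Simple stored procedure; the MSSQL source ingests it as an MSSQL_STORED_PROCEDURE dataJob in the golden files.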
+CREATE PROCEDURE Foo.DBs @ID INT
+AS
+ SELECT @ID AS ThatDB;
+GO
GO
EXEC sys.sp_addextendedproperty
@@ -59,5 +63,31 @@ EXEC sys.sp_addextendedproperty
@value = N'Description for column LastName of table Persons of schema Foo.',
@level0type = N'SCHEMA', @level0name = 'Foo',
@level1type = N'TABLE', @level1name = 'Persons',
-@level2type = N'COLUMN',@level2name = 'LastName';
-GO
\ No newline at end of file
+@level2type = N'COLUMN',@level2name = 'LastName';
+GO
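+-- Register a SQL Agent job ("Weekly Demo Data Backup") with a single T-SQL step; the MSSQL source emits it as a dataFlow plus an MSSQL_JOB_STEP dataJob.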
+USE msdb ;
+GO
+EXEC dbo.sp_add_job
+ @job_name = N'Weekly Demo Data Backup' ;
+GO
+EXEC sp_add_jobstep
+ @job_name = N'Weekly Demo Data Backup',
+ @step_name = N'Set database to read only',
+ @database_name = N'DemoData',
+ @subsystem = N'TSQL',
+ @command = N'ALTER DATABASE DemoData SET READ_ONLY',
+ @retry_attempts = 5,
+ @retry_interval = 5 ;
+GO
+EXEC dbo.sp_add_schedule
+ @schedule_name = N'RunOnce',
+ @freq_type = 1,
+ @active_start_time = 233000 ;
+GO
+EXEC sp_attach_schedule
+ @job_name = N'Weekly Demo Data Backup',
+ @schedule_name = N'RunOnce';
+GO
+EXEC dbo.sp_add_jobserver
+ @job_name = N'Weekly Demo Data Backup'
+GO
diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py
index 3e7b75edd48781..099690fed34c27 100644
--- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py
+++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py
@@ -50,4 +50,9 @@ def test_mssql_ingest(mssql_runner, pytestconfig, tmp_path, mock_time, config_fi
output_path=tmp_path / "mssql_mces.json",
golden_path=test_resources_dir
/ f"golden_files/golden_mces_{config_file.replace('yml','json')}",
+ ignore_paths=[
+ r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['job_id'\]",
+ r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_created'\]",
+ r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_modified'\]",
+ ],
)
diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
index d04c8d905b4397..71428a78479535 100644
--- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
+++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
@@ -791,11 +791,9 @@ def test_tableau_unsupported_csql(mock_datahub_graph):
database_override_map={"production database": "prod"}
)
- with mock.patch(
- "datahub.ingestion.source.tableau.sqlglot_lineage"
- ) as sqlglot_lineage:
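+ # Patch datahub.ingestion.source.tableau.sqlglot_l and stub the return value of its create_lineage_sql_parsed_result call.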
+ with mock.patch("datahub.ingestion.source.tableau.sqlglot_l") as sqlglot_lineage:
- sqlglot_lineage.return_value = SqlParsingResult( # type:ignore
+ sqlglot_lineage.create_lineage_sql_parsed_result.return_value = SqlParsingResult( # type:ignore
in_tables=[
"urn:li:dataset:(urn:li:dataPlatform:bigquery,my_bigquery_project.invent_dw.userdetail,PROD)"
],
diff --git a/metadata-ingestion/tests/integration/vertica/docker-compose.yml b/metadata-ingestion/tests/integration/vertica/docker-compose.yml
index ddaf206f236cf5..84af5c32a60e30 100644
--- a/metadata-ingestion/tests/integration/vertica/docker-compose.yml
+++ b/metadata-ingestion/tests/integration/vertica/docker-compose.yml
@@ -1,6 +1,7 @@
version: "3.9"
services:
vertica:
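+ # Run the Vertica container as linux/amd64 regardless of the host architecture.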
+ platform: linux/amd64
environment:
APP_DB_USER: "dbadmin"
APP_DB_PASSWORD: "abc123"
@@ -18,6 +19,3 @@ services:
volumes:
vertica-data:
-
-
-
diff --git a/metadata-ingestion/tests/integration/vertica/test_vertica.py b/metadata-ingestion/tests/integration/vertica/test_vertica.py
index db8bfd247313b0..fe306d1d0b2b8b 100644
--- a/metadata-ingestion/tests/integration/vertica/test_vertica.py
+++ b/metadata-ingestion/tests/integration/vertica/test_vertica.py
@@ -58,6 +58,7 @@ def vertica_runner(docker_compose_runner, test_resources_dir):
# Test needs more work to be done; currently it is working fine.
@freeze_time(FROZEN_TIME)
+@pytest.mark.skip("Failing in CI, cmd failing with exit code 1")
@pytest.mark.integration
def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path):
test_resources_dir = pytestconfig.rootpath / "tests/integration/vertica"
diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py
index fc8ca166b105ad..47418d9a989bb5 100644
--- a/metadata-ingestion/tests/unit/test_bigquery_source.py
+++ b/metadata-ingestion/tests/unit/test_bigquery_source.py
@@ -138,13 +138,12 @@ def test_get_dataplatform_instance_aspect_returns_project_id():
f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})"
)
- config = BigQueryV2Config.parse_obj({})
+ config = BigQueryV2Config.parse_obj({"include_data_platform_instance": True})
source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
data_platform_instance = source.get_dataplatform_instance_aspect(
"urn:li:test", project_id
)
-
metadata = data_platform_instance.get_metadata()["metadata"]
assert data_platform_instance is not None
@@ -152,6 +151,20 @@ def test_get_dataplatform_instance_aspect_returns_project_id():
assert metadata.aspect.instance == expected_instance
+def test_get_dataplatform_instance_default_no_instance():
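+ # include_data_platform_instance defaults to False, so the aspect is emitted without an instance.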
+ config = BigQueryV2Config.parse_obj({})
+ source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test"))
+
+ data_platform_instance = source.get_dataplatform_instance_aspect(
+ "urn:li:test", "project_id"
+ )
+ metadata = data_platform_instance.get_metadata()["metadata"]
+
+ assert data_platform_instance is not None
+ assert metadata.aspectName == "dataPlatformInstance"
+ assert metadata.aspect.instance is None
+
+
@patch("google.cloud.bigquery.client.Client")
def test_get_projects_with_single_project_id(client_mock):
config = BigQueryV2Config.parse_obj({"project_id": "test-3"})
diff --git a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/test_snowflake_shares.py
new file mode 100644
index 00000000000000..7de86139baf39a
--- /dev/null
+++ b/metadata-ingestion/tests/unit/test_snowflake_shares.py
@@ -0,0 +1,348 @@
+from typing import List
+
+import pytest
+
+from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.source.snowflake.snowflake_config import (
+ DatabaseId,
+ SnowflakeShareConfig,
+ SnowflakeV2Config,
+)
+from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
+from datahub.ingestion.source.snowflake.snowflake_schema import (
+ SnowflakeDatabase,
+ SnowflakeSchema,
+)
+from datahub.ingestion.source.snowflake.snowflake_shares import SnowflakeSharesHandler
+from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
+from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
+from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeProposal
+
+
+@pytest.fixture(scope="module")
+def snowflake_databases() -> List[SnowflakeDatabase]:
+ return [
+ SnowflakeDatabase(
+ name="db1",
+ created=None,
+ comment=None,
+ last_altered=None,
+ schemas=[
+ SnowflakeSchema(
+ name="schema11",
+ created=None,
+ comment=None,
+ last_altered=None,
+ tables=["table111", "table112"],
+ views=["view111"],
+ ),
+ SnowflakeSchema(
+ name="schema12",
+ created=None,
+ comment=None,
+ last_altered=None,
+ tables=["table121", "table122"],
+ views=["view121"],
+ ),
+ ],
+ ),
+ SnowflakeDatabase(
+ name="db2",
+ created=None,
+ comment=None,
+ last_altered=None,
+ schemas=[
+ SnowflakeSchema(
+ name="schema21",
+ created=None,
+ comment=None,
+ last_altered=None,
+ tables=["table211", "table212"],
+ views=["view211"],
+ ),
+ SnowflakeSchema(
+ name="schema22",
+ created=None,
+ comment=None,
+ last_altered=None,
+ tables=["table221", "table222"],
+ views=["view221"],
+ ),
+ ],
+ ),
+ SnowflakeDatabase(
+ name="db3",
+ created=None,
+ comment=None,
+ last_altered=None,
+ schemas=[
+ SnowflakeSchema(
+ name="schema31",
+ created=None,
+ comment=None,
+ last_altered=None,
+ tables=["table311", "table312"],
+ views=["view311"],
+ )
+ ],
+ ),
+ ]
+
+
+def make_snowflake_urn(table_name, instance_name=None):
+ return make_dataset_urn_with_platform_instance(
+ "snowflake", table_name, instance_name
+ )
+
+
+def test_snowflake_shares_workunit_no_shares(
+ snowflake_databases: List[SnowflakeDatabase],
+) -> None:
+ config = SnowflakeV2Config(account_id="abc12345", platform_instance="instance1")
+
+ report = SnowflakeV2Report()
+ shares_handler = SnowflakeSharesHandler(
+ config, report, lambda x: make_snowflake_urn(x)
+ )
+
+ wus = list(shares_handler.get_shares_workunits(snowflake_databases))
+
+ assert len(wus) == 0
+
+
+def test_same_database_inbound_and_outbound_invalid_config() -> None:
+ with pytest.raises(
+ ValueError,
+ match="Same database can not be present as consumer in more than one share",
+ ):
+ SnowflakeV2Config(
+ account_id="abc12345",
+ platform_instance="instance1",
+ shares={
+ "share1": SnowflakeShareConfig(
+ database="db1",
+ platform_instance="instance2",
+ consumers=[
+ DatabaseId(database="db1", platform_instance="instance1")
+ ],
+ ),
+ "share2": SnowflakeShareConfig(
+ database="db1",
+ platform_instance="instance3",
+ consumers=[
+ DatabaseId(database="db1", platform_instance="instance1")
+ ],
+ ),
+ },
+ )
+
+ with pytest.raises(
+ ValueError,
+ match="Database included in a share can not be present as consumer in any share",
+ ):
+ SnowflakeV2Config(
+ account_id="abc12345",
+ platform_instance="instance1",
+ shares={
+ "share1": SnowflakeShareConfig(
+ database="db1",
+ platform_instance="instance2",
+ consumers=[
+ DatabaseId(database="db1", platform_instance="instance1")
+ ],
+ ),
+ "share2": SnowflakeShareConfig(
+ database="db1",
+ platform_instance="instance1",
+ consumers=[
+ DatabaseId(database="db1", platform_instance="instance3")
+ ],
+ ),
+ },
+ )
+
+ with pytest.raises(
+ ValueError,
+ match="Database included in a share can not be present as consumer in any share",
+ ):
+ SnowflakeV2Config(
+ account_id="abc12345",
+ platform_instance="instance1",
+ shares={
+ "share2": SnowflakeShareConfig(
+ database="db1",
+ platform_instance="instance1",
+ consumers=[
+ DatabaseId(database="db1", platform_instance="instance3")
+ ],
+ ),
+ "share1": SnowflakeShareConfig(
+ database="db1",
+ platform_instance="instance2",
+ consumers=[
+ DatabaseId(database="db1", platform_instance="instance1")
+ ],
+ ),
+ },
+ )
+
+
+def test_snowflake_shares_workunit_inbound_share(
+ snowflake_databases: List[SnowflakeDatabase],
+) -> None:
+ config = SnowflakeV2Config(
+ account_id="abc12345",
+ platform_instance="instance1",
+ shares={
+ "share1": SnowflakeShareConfig(
+ database="db1",
+ platform_instance="instance2",
+ consumers=[DatabaseId(database="db1", platform_instance="instance1")],
+ )
+ },
+ )
+
+ report = SnowflakeV2Report()
+ shares_handler = SnowflakeSharesHandler(
+ config, report, lambda x: make_snowflake_urn(x, "instance1")
+ )
+
+ wus = list(shares_handler.get_shares_workunits(snowflake_databases))
+
+ # 2 schemas - 2 tables and 1 view in each schema, making a total of 6 datasets
+ # Hence 6 Sibling and 6 upstreamLineage aspects
+ assert len(wus) == 12
+ upstream_lineage_aspect_entity_urns = set()
+ sibling_aspect_entity_urns = set()
+
+ for wu in wus:
+ assert isinstance(
+ wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper)
+ )
+ if wu.metadata.aspectName == "upstreamLineage":
+ upstream_aspect = wu.get_aspect_of_type(UpstreamLineage)
+ assert upstream_aspect is not None
+ assert len(upstream_aspect.upstreams) == 1
+ assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace(
+ "instance1.db1", "instance2.db1"
+ )
+ upstream_lineage_aspect_entity_urns.add(wu.get_urn())
+ else:
+ siblings_aspect = wu.get_aspect_of_type(Siblings)
+ assert siblings_aspect is not None
+ assert len(siblings_aspect.siblings) == 1
+ assert siblings_aspect.siblings == [
+ wu.get_urn().replace("instance1.db1", "instance2.db1")
+ ]
+ sibling_aspect_entity_urns.add(wu.get_urn())
+
+ assert upstream_lineage_aspect_entity_urns == sibling_aspect_entity_urns
+
+
+def test_snowflake_shares_workunit_outbound_share(
+ snowflake_databases: List[SnowflakeDatabase],
+) -> None:
+ config = SnowflakeV2Config(
+ account_id="abc12345",
+ platform_instance="instance1",
+ shares={
+ "share2": SnowflakeShareConfig(
+ database="db2",
+ platform_instance="instance1",
+ consumers=[
+ DatabaseId(
+ database="db2_from_share", platform_instance="instance2"
+ ),
+ DatabaseId(database="db2", platform_instance="instance3"),
+ ],
+ )
+ },
+ )
+
+ report = SnowflakeV2Report()
+ shares_handler = SnowflakeSharesHandler(
+ config, report, lambda x: make_snowflake_urn(x, "instance1")
+ )
+
+ wus = list(shares_handler.get_shares_workunits(snowflake_databases))
+
+ # 2 schemas - 2 tables and 1 view in each schema, making a total of 6 datasets
+ # Hence 6 Sibling aspects
+ assert len(wus) == 6
+ entity_urns = set()
+
+ for wu in wus:
+ siblings_aspect = wu.get_aspect_of_type(Siblings)
+ assert siblings_aspect is not None
+ assert len(siblings_aspect.siblings) == 2
+ assert siblings_aspect.siblings == [
+ wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"),
+ wu.get_urn().replace("instance1.db2", "instance3.db2"),
+ ]
+ entity_urns.add(wu.get_urn())
+
+ assert len(entity_urns) == 6
+
+
+def test_snowflake_shares_workunit_inbound_and_outbound_share(
+ snowflake_databases: List[SnowflakeDatabase],
+) -> None:
+ config = SnowflakeV2Config(
+ account_id="abc12345",
+ platform_instance="instance1",
+ shares={
+ "share1": SnowflakeShareConfig(
+ database="db1",
+ platform_instance="instance2",
+ consumers=[DatabaseId(database="db1", platform_instance="instance1")],
+ ),
+ "share2": SnowflakeShareConfig(
+ database="db2",
+ platform_instance="instance1",
+ consumers=[
+ DatabaseId(
+ database="db2_from_share", platform_instance="instance2"
+ ),
+ DatabaseId(database="db2", platform_instance="instance3"),
+ ],
+ ),
+ },
+ )
+
+ report = SnowflakeV2Report()
+ shares_handler = SnowflakeSharesHandler(
+ config, report, lambda x: make_snowflake_urn(x, "instance1")
+ )
+
+ wus = list(shares_handler.get_shares_workunits(snowflake_databases))
+
+ # 6 Sibling and 6 upstreamLineage aspects for db1 tables
+ # 6 Sibling aspects for db2 tables
+ assert len(wus) == 18
+
+ for wu in wus:
+ assert isinstance(
+ wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper)
+ )
+ if wu.metadata.aspectName == "upstreamLineage":
+ upstream_aspect = wu.get_aspect_of_type(UpstreamLineage)
+ assert upstream_aspect is not None
+ assert len(upstream_aspect.upstreams) == 1
+ assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace(
+ "instance1.db1", "instance2.db1"
+ )
+ else:
+ siblings_aspect = wu.get_aspect_of_type(Siblings)
+ assert siblings_aspect is not None
+ if "db1" in wu.get_urn():
+ assert len(siblings_aspect.siblings) == 1
+ assert siblings_aspect.siblings == [
+ wu.get_urn().replace("instance1.db1", "instance2.db1")
+ ]
+ else:
+ assert len(siblings_aspect.siblings) == 2
+ assert siblings_aspect.siblings == [
+ wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"),
+ wu.get_urn().replace("instance1.db2", "instance3.db2"),
+ ]
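
For reviewers following the expected counts and URNs in the tests above, a small illustrative sketch; the replace-based URN rewriting mirrors what the assertions check, not necessarily how SnowflakeSharesHandler builds the URNs internally:

# Inbound share on db1: 2 schemas, each with 2 tables and 1 view -> 6 datasets,
# and each dataset gets one Siblings plus one upstreamLineage aspect -> 12 workunits.
schemas = 2
datasets_per_schema = 2 + 1
datasets = schemas * datasets_per_schema
inbound_workunits = datasets * 2
assert (datasets, inbound_workunits) == (6, 12)

# The "other side" of a share is derived by swapping the platform instance
# (and the database name, for renamed consumers) inside the dataset URN.
consumer_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.db1.schema11.table111,PROD)"
producer_urn = consumer_urn.replace("instance1.db1", "instance2.db1")
print(producer_urn)
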
diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle
index 025273fc9263ea..82273427974af9 100644
--- a/metadata-integration/java/datahub-client/build.gradle
+++ b/metadata-integration/java/datahub-client/build.gradle
@@ -235,3 +235,7 @@ sourceSets.main.java.srcDir "${generateOpenApiPojos.outputDir}/src/main/java"
sourceSets.main.resources.srcDir "${generateOpenApiPojos.outputDir}/src/main/resources"
checkstyleMain.exclude '**/generated/**'
+
+clean {
+ project.delete("$projectDir/generated")
+}
\ No newline at end of file
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java
index 555acb2ffdd3b6..4bbff3915aca93 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java
@@ -6,6 +6,7 @@
import com.linkedin.metadata.models.SearchableFieldSpec;
import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -42,6 +43,13 @@ public static Map getPartialNgramConfigWithOverrides(Map getMappingsForField(@Nonnull final Searchable
mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER);
// Add keyword subfield without lowercase filter
mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP));
- } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL) {
+ } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) {
mappingForField.put(TYPE, KEYWORD);
mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER);
Map<String, Object> subFields = new HashMap<>();
- if (fieldType == FieldType.TEXT_PARTIAL) {
+ if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) {
subFields.put(NGRAM, getPartialNgramConfigWithOverrides(
ImmutableMap.of(
ANALYZER, PARTIAL_ANALYZER
)
));
+ if (fieldType == FieldType.WORD_GRAM) {
+ for (Map.Entry<String, String> entry : Map.of(
+ WORD_GRAMS_LENGTH_2, WORD_GRAM_2_ANALYZER,
+ WORD_GRAMS_LENGTH_3, WORD_GRAM_3_ANALYZER,
+ WORD_GRAMS_LENGTH_4, WORD_GRAM_4_ANALYZER).entrySet()) {
+ String fieldName = entry.getKey();
+ String analyzerName = entry.getValue();
+ subFields.put(fieldName, ImmutableMap.of(
+ TYPE, TEXT,
+ ANALYZER, analyzerName,
+ SEARCH_ANALYZER, analyzerName
+ ));
+ }
+ }
}
subFields.put(DELIMITED, ImmutableMap.of(
TYPE, TEXT,
@@ -163,6 +185,7 @@ private static Map<String, Object> getMappingsForField(@Nonnull final Searchable
searchableFieldSpec.getSearchableAnnotation()
.getNumValuesFieldName()
.ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, LONG)));
+ mappings.putAll(getMappingsForFieldNameAliases(searchableFieldSpec));
return mappings;
}
@@ -172,4 +195,16 @@ private static Map<String, Object> getMappingsForSearchScoreField(
return ImmutableMap.of(searchScoreFieldSpec.getSearchScoreAnnotation().getFieldName(),
ImmutableMap.of(TYPE, DOUBLE));
}
+
+ private static Map<String, Object> getMappingsForFieldNameAliases(@Nonnull final SearchableFieldSpec searchableFieldSpec) {
+ Map<String, Object> mappings = new HashMap<>();
+ List<String> fieldNameAliases = searchableFieldSpec.getSearchableAnnotation().getFieldNameAliases();
+ fieldNameAliases.forEach(alias -> {
+ Map<String, Object> aliasMappings = new HashMap<>();
+ aliasMappings.put(TYPE, ALIAS);
+ aliasMappings.put(PATH, searchableFieldSpec.getSearchableAnnotation().getFieldName());
+ mappings.put(alias, aliasMappings);
+ });
+ return mappings;
+ }
}
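
For context on getMappingsForFieldNameAliases above: each alias in fieldNameAliases becomes an Elasticsearch field alias pointing at the annotated field. Roughly the fragment it contributes, with values per the MappingsBuilderTest assertions later in this diff ("textFieldOverride" is just the test entity's field):

entity_name_alias = {
    "_entityName": {
        "type": "alias",              # ES field alias, not a copy of the data
        "path": "textFieldOverride",  # the annotated field the alias resolves to
    }
}
print(entity_name_alias)
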
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java
index 5b3e396837aa7d..e180c8296b48d4 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java
@@ -66,6 +66,9 @@ public class SettingsBuilder {
public static final String KEYWORD_ANALYZER = "keyword";
public static final String URN_ANALYZER = "urn_component";
public static final String URN_SEARCH_ANALYZER = "query_urn_component";
+ public static final String WORD_GRAM_2_ANALYZER = "word_gram_2";
+ public static final String WORD_GRAM_3_ANALYZER = "word_gram_3";
+ public static final String WORD_GRAM_4_ANALYZER = "word_gram_4";
// Filters
public static final String ALPHANUM_SPACE_ONLY = "alpha_num_space";
@@ -80,6 +83,10 @@ public class SettingsBuilder {
public static final String MULTIFILTER = "multifilter";
public static final String MULTIFILTER_GRAPH = "multifilter_graph";
public static final String PARTIAL_URN_COMPONENT = "partial_urn_component";
+ public static final String SHINGLE = "shingle";
+ public static final String WORD_GRAM_2_FILTER = "word_gram_2_filter";
+ public static final String WORD_GRAM_3_FILTER = "word_gram_3_filter";
+ public static final String WORD_GRAM_4_FILTER = "word_gram_4_filter";
public static final String SNOWBALL = "snowball";
public static final String STEM_OVERRIDE = "stem_override";
public static final String STOP = "stop";
@@ -108,6 +115,7 @@ public class SettingsBuilder {
public static final String SLASH_TOKENIZER = "slash_tokenizer";
public static final String UNIT_SEPARATOR_PATH_TOKENIZER = "unit_separator_path_tokenizer";
public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer";
+ public static final String WORD_GRAM_TOKENIZER = "word_gram_tokenizer";
// Do not remove the space, needed for multi-term synonyms
public static final List<String> ALPHANUM_SPACE_PATTERNS = ImmutableList.of(
"([a-z0-9 _-]{2,})",
@@ -161,6 +169,13 @@ public class SettingsBuilder {
AUTOCOMPLETE_CUSTOM_DELIMITER,
LOWERCASE);
+ public static final List<String> WORD_GRAM_TOKEN_FILTERS = ImmutableList.of(
+ ASCII_FOLDING,
+ LOWERCASE,
+ TRIM,
+ REMOVE_QUOTES
+ );
+
public final Map<String, Object> settings;
public SettingsBuilder(String mainTokenizer) {
@@ -275,6 +290,17 @@ private static Map<String, Object> buildFilters() throws IOException {
.collect(Collectors.toList()))
.build());
}
+
+ for (Map.Entry<String, Integer> entry : Map.of(WORD_GRAM_2_FILTER, 2, WORD_GRAM_3_FILTER, 3, WORD_GRAM_4_FILTER, 4).entrySet()) {
+ String filterName = entry.getKey();
+ Integer gramSize = entry.getValue();
+ filters.put(filterName, ImmutableMap.builder()
+ .put(TYPE, SHINGLE)
+ .put("min_shingle_size", gramSize)
+ .put("max_shingle_size", gramSize)
+ .put("output_unigrams", false)
+ .build());
+ }
}
return filters.build();
@@ -302,13 +328,24 @@ private static Map<String, Object> buildTokenizers() {
.put(DELIMITER, "␟")
.build());
- // Tokenize by whitespace and most special chars
+ // Tokenize by most special chars
+ // Do NOT tokenize by whitespace to keep multi-word synonyms in the same token
+ // The split by whitespace is done later in the token filters phase
tokenizers.put(MAIN_TOKENIZER,
ImmutableMap.builder()
.put(TYPE, PATTERN)
.put(PATTERN, "[(),./:]")
.build());
+ // Tokenize by whitespace and most special chars for word grams
+ // Only split on "-" when it is not preceded by whitespace, to preserve exclusion functionality,
+ // i.e. "logging-events-bkcp" and "logging-events -bckp" should be handled differently
+ tokenizers.put(WORD_GRAM_TOKENIZER,
+ ImmutableMap.builder()
+ .put(TYPE, PATTERN)
+ .put(PATTERN, "[(),./:\\s_]|(?<=\\S)(-)")
+ .build());
+
return tokenizers.build();
}
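
To illustrate the word_gram_tokenizer pattern above, a rough Python approximation of where it splits (the capture group around "-" in the Java constant does not change the split points; Elasticsearch's pattern tokenizer remains the authority):

import re

WORD_GRAM_SPLIT = r"[(),./:\s_]|(?<=\S)-"

def split_for_word_grams(text: str):
    # Split on the separators and drop empty pieces; lowercasing/trim/quote
    # removal happen later in the analyzer's token filters.
    return [piece for piece in re.split(WORD_GRAM_SPLIT, text) if piece]

print(split_for_word_grams("logging-events-bkcp"))   # ['logging', 'events', 'bkcp']
print(split_for_word_grams("logging-events -bckp"))  # ['logging', 'events', '-bckp']
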
@@ -382,6 +419,21 @@ private static Map<String, Object> buildAnalyzers(String mainTokenizer) {
.put(FILTER, SEARCH_TOKEN_FILTERS)
.build());
+ // Support word grams
+ for (Map.Entry<String, String> entry : Map.of(
+ WORD_GRAM_2_ANALYZER, WORD_GRAM_2_FILTER,
+ WORD_GRAM_3_ANALYZER, WORD_GRAM_3_FILTER,
+ WORD_GRAM_4_ANALYZER, WORD_GRAM_4_FILTER).entrySet()) {
+ String analyzerName = entry.getKey();
+ String filterName = entry.getValue();
+ analyzers.put(analyzerName, ImmutableMap.builder()
+ .put(TOKENIZER, WORD_GRAM_TOKENIZER)
+ .put(FILTER, ImmutableList.builder()
+ .addAll(WORD_GRAM_TOKEN_FILTERS)
+ .add(filterName).build())
+ .build());
+ }
+
// For special analysis, the substitution can be read from the configuration (chinese tokenizer: ik_smart / smartCN)
// Analyzer for partial matching (i.e. autocomplete) - Prefix matching of each token
analyzers.put(PARTIAL_ANALYZER, ImmutableMap.builder()
@@ -395,6 +447,7 @@ private static Map buildAnalyzers(String mainTokenizer) {
.put(FILTER, PARTIAL_AUTOCOMPLETE_TOKEN_FILTERS)
.build());
+
return analyzers.build();
}
}
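
As a sanity check on the three shingle-based word gram analyzers defined above, a small Python approximation of what they emit once the text is tokenized (it matches the expectations in SampleDataFixtureTests.testWordGram further down; the real analyzers also ascii-fold, lowercase, trim and strip quotes first):

def word_grams(tokens, n):
    # min_shingle_size == max_shingle_size == n and output_unigrams=false:
    # only full n-word shingles are emitted, nothing shorter.
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

tokens = ["hello", "cat", "cool", "customer"]
print(word_grams(tokens, 2))    # ['hello cat', 'cat cool', 'cool customer']
print(word_grams(tokens, 3))    # ['hello cat cool', 'cat cool customer']
print(word_grams(tokens, 4))    # ['hello cat cool customer']
print(word_grams(["hello"], 2)) # [] -- single-token input yields no word grams
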
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java
index fb7e19a5d67bcb..a75ed40ffca529 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java
@@ -11,11 +11,8 @@
import java.util.Set;
-import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER;
-import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER;
-import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER;
-import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER;
-import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER;
+import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*;
+
@Builder
@Getter
@@ -33,7 +30,8 @@ public class SearchFieldConfig {
private static final Set<SearchableAnnotation.FieldType> TYPES_WITH_DELIMITED_SUBFIELD =
Set.of(
SearchableAnnotation.FieldType.TEXT,
- SearchableAnnotation.FieldType.TEXT_PARTIAL
+ SearchableAnnotation.FieldType.TEXT_PARTIAL,
+ SearchableAnnotation.FieldType.WORD_GRAM
// NOT URN_PARTIAL (urn field is special)
);
// NOT comprehensive
@@ -56,6 +54,7 @@ public class SearchFieldConfig {
SearchableAnnotation.FieldType.TEXT,
SearchableAnnotation.FieldType.TEXT_PARTIAL,
SearchableAnnotation.FieldType.KEYWORD,
+ SearchableAnnotation.FieldType.WORD_GRAM,
// not analyzed
SearchableAnnotation.FieldType.BOOLEAN,
SearchableAnnotation.FieldType.COUNT,
@@ -69,6 +68,11 @@ public class SearchFieldConfig {
SearchableAnnotation.FieldType.URN_PARTIAL
);
+ public static final Set<SearchableAnnotation.FieldType> TYPES_WITH_WORD_GRAM =
+ Set.of(
+ SearchableAnnotation.FieldType.WORD_GRAM
+ );
+
@Nonnull
private final String fieldName;
@Nonnull
@@ -78,9 +82,11 @@ public class SearchFieldConfig {
private final String analyzer;
private boolean hasKeywordSubfield;
private boolean hasDelimitedSubfield;
+ private boolean hasWordGramSubfields;
private boolean isQueryByDefault;
private boolean isDelimitedSubfield;
private boolean isKeywordSubfield;
+ private boolean isWordGramSubfield;
public static SearchFieldConfig detectSubFieldType(@Nonnull SearchableFieldSpec fieldSpec) {
final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation();
@@ -106,6 +112,7 @@ public static SearchFieldConfig detectSubFieldType(String fieldName,
.analyzer(getAnalyzer(fieldName, fieldType))
.hasKeywordSubfield(hasKeywordSubfield(fieldName, fieldType))
.hasDelimitedSubfield(hasDelimitedSubfield(fieldName, fieldType))
+ .hasWordGramSubfields(hasWordGramSubfields(fieldName, fieldType))
.isQueryByDefault(isQueryByDefault)
.build();
}
@@ -118,6 +125,11 @@ private static boolean hasDelimitedSubfield(String fieldName, SearchableAnnotati
return !fieldName.contains(".")
&& ("urn".equals(fieldName) || TYPES_WITH_DELIMITED_SUBFIELD.contains(fieldType));
}
+
+ private static boolean hasWordGramSubfields(String fieldName, SearchableAnnotation.FieldType fieldType) {
+ return !fieldName.contains(".")
+ && (TYPES_WITH_WORD_GRAM.contains(fieldType));
+ }
private static boolean hasKeywordSubfield(String fieldName, SearchableAnnotation.FieldType fieldType) {
return !"urn".equals(fieldName)
&& !fieldName.contains(".")
@@ -155,6 +167,7 @@ public SearchFieldConfigBuilder fieldName(@Nonnull String fieldName) {
this.fieldName = fieldName;
isDelimitedSubfield(fieldName.endsWith(".delimited"));
isKeywordSubfield(fieldName.endsWith(".keyword"));
+ isWordGramSubfield(fieldName.contains("wordGrams"));
shortName(fieldName.split("[.]")[0]);
return this;
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java
index 289c6f1f84e323..49fc882314e0a0 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java
@@ -3,6 +3,7 @@
import com.linkedin.metadata.config.search.ExactMatchConfiguration;
import com.linkedin.metadata.config.search.PartialConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
+import com.linkedin.metadata.config.search.WordGramConfiguration;
import com.linkedin.metadata.config.search.custom.BoolQueryConfiguration;
import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
import com.linkedin.metadata.config.search.custom.QueryConfiguration;
@@ -51,6 +52,9 @@
import org.elasticsearch.search.SearchModule;
import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_URN_SEARCH_PROPERTIES;
+import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*;
+import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.*;
+
@Slf4j
public class SearchQueryBuilder {
@@ -69,6 +73,7 @@ public class SearchQueryBuilder {
public static final String STRUCTURED_QUERY_PREFIX = "\\\\/q ";
private final ExactMatchConfiguration exactMatchConfiguration;
private final PartialConfiguration partialConfiguration;
+ private final WordGramConfiguration wordGramConfiguration;
private final CustomizedQueryHandler customizedQueryHandler;
@@ -76,6 +81,7 @@ public SearchQueryBuilder(@Nonnull SearchConfiguration searchConfiguration,
@Nullable CustomSearchConfiguration customSearchConfiguration) {
this.exactMatchConfiguration = searchConfiguration.getExactMatch();
this.partialConfiguration = searchConfiguration.getPartial();
+ this.wordGramConfiguration = searchConfiguration.getWordGram();
this.customizedQueryHandler = CustomizedQueryHandler.builder(customSearchConfiguration).build();
}
@@ -148,6 +154,36 @@ private Set<SearchFieldConfig> getStandardFields(@Nonnull EntitySpec entitySpec)
fields.add(SearchFieldConfig.detectSubFieldType(searchFieldConfig.fieldName() + ".delimited",
searchFieldConfig.boost() * partialConfiguration.getFactor(),
searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault()));
+
+ if (SearchFieldConfig.detectSubFieldType(fieldSpec).hasWordGramSubfields()) {
+ fields.add(SearchFieldConfig.builder()
+ .fieldName(searchFieldConfig.fieldName() + ".wordGrams2")
+ .boost(searchFieldConfig.boost() * wordGramConfiguration.getTwoGramFactor())
+ .analyzer(WORD_GRAM_2_ANALYZER)
+ .hasKeywordSubfield(true)
+ .hasDelimitedSubfield(true)
+ .hasWordGramSubfields(true)
+ .isQueryByDefault(true)
+ .build());
+ fields.add(SearchFieldConfig.builder()
+ .fieldName(searchFieldConfig.fieldName() + ".wordGrams3")
+ .boost(searchFieldConfig.boost() * wordGramConfiguration.getThreeGramFactor())
+ .analyzer(WORD_GRAM_3_ANALYZER)
+ .hasKeywordSubfield(true)
+ .hasDelimitedSubfield(true)
+ .hasWordGramSubfields(true)
+ .isQueryByDefault(true)
+ .build());
+ fields.add(SearchFieldConfig.builder()
+ .fieldName(searchFieldConfig.fieldName() + ".wordGrams4")
+ .boost(searchFieldConfig.boost() * wordGramConfiguration.getFourGramFactor())
+ .analyzer(WORD_GRAM_4_ANALYZER)
+ .hasKeywordSubfield(true)
+ .hasDelimitedSubfield(true)
+ .hasWordGramSubfields(true)
+ .isQueryByDefault(true)
+ .build());
+ }
}
}
@@ -188,7 +224,7 @@ private Optional<QueryBuilder> getSimpleQuery(@Nullable QueryConfiguration custo
.filter(SearchFieldConfig::isQueryByDefault)
.collect(Collectors.groupingBy(SearchFieldConfig::analyzer));
- analyzerGroup.keySet().stream().sorted().forEach(analyzer -> {
+ analyzerGroup.keySet().stream().sorted().filter(str -> !str.contains("word_gram")).forEach(analyzer -> {
List<SearchFieldConfig> fieldConfigs = analyzerGroup.get(analyzer);
SimpleQueryStringBuilder simpleBuilder = QueryBuilders.simpleQueryStringQuery(sanitizedQuery);
simpleBuilder.analyzer(analyzer);
@@ -253,6 +289,13 @@ private Optional<QueryBuilder> getPrefixAndExactMatchQuery(@Nullable QueryConfig
* exactMatchConfiguration.getCaseSensitivityFactor())
.queryName(searchFieldConfig.fieldName()));
}
+
+ if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) {
+ finalQuery.should(QueryBuilders
+ .matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery)
+ .boost(searchFieldConfig.boost() * getWordGramFactor(searchFieldConfig.fieldName()))
+ .queryName(searchFieldConfig.shortName()));
+ }
});
return finalQuery.should().size() > 0 ? Optional.of(finalQuery) : Optional.empty();
@@ -377,4 +420,15 @@ private FunctionScoreQueryBuilder toFunctionScoreQueryBuilder(QueryBuilder query
throw new RuntimeException(e);
}
}
+
+ public float getWordGramFactor(String fieldName) {
+ if (fieldName.endsWith("Grams2")) {
+ return wordGramConfiguration.getTwoGramFactor();
+ } else if (fieldName.endsWith("Grams3")) {
+ return wordGramConfiguration.getThreeGramFactor();
+ } else if (fieldName.endsWith("Grams4")) {
+ return wordGramConfiguration.getFourGramFactor();
+ }
+ throw new IllegalArgumentException(fieldName + " does not end with Grams[2-4]");
+ }
}
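
A note on the boost numbers asserted later in SearchQueryBuilderTest (1.44 / 2.25 / ~3.24): as far as I can tell from the code above, the gram factor is applied once when the wordGramsN subfield is registered in getStandardFields and once more via getWordGramFactor for the match_phrase clause, so the effective boost is the factor squared. Quick arithmetic check:

base_boost = 1.0
gram_factors = {"wordGramField.wordGrams2": 1.2,
                "wordGramField.wordGrams3": 1.5,
                "wordGramField.wordGrams4": 1.8}
for field, factor in gram_factors.items():
    subfield_boost = base_boost * factor    # applied in getStandardFields
    phrase_boost = subfield_boost * factor  # applied again in getPrefixAndExactMatchQuery
    print(field, round(phrase_boost, 4))    # 1.44, 2.25, 3.24 (3.2399998 as a 32-bit float)
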
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java
index bd1e6037ec0c51..5973f77da28aac 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java
@@ -28,6 +28,8 @@
import com.linkedin.metadata.search.SearchEntityArray;
import com.linkedin.metadata.search.SearchResult;
import com.linkedin.metadata.search.SearchResultMetadata;
+import com.linkedin.metadata.search.SearchSuggestion;
+import com.linkedin.metadata.search.SearchSuggestionArray;
import com.linkedin.metadata.search.features.Features;
import com.linkedin.metadata.search.utils.ESUtils;
import com.linkedin.metadata.utils.SearchUtil;
@@ -68,7 +70,9 @@
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
+import org.elasticsearch.search.suggest.term.TermSuggestion;
+import static com.linkedin.metadata.search.utils.ESUtils.NAME_SUGGESTION;
import static com.linkedin.metadata.search.utils.ESUtils.toFacetField;
import static com.linkedin.metadata.search.utils.SearchUtils.applyDefaultSearchFlags;
import static com.linkedin.metadata.utils.SearchUtil.*;
@@ -199,6 +203,11 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi
searchSourceBuilder.highlighter(_highlights);
}
ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion);
+
+ if (finalSearchFlags.isGetSuggestions()) {
+ ESUtils.buildNameSuggestions(searchSourceBuilder, input);
+ }
+
searchRequest.source(searchSourceBuilder);
log.debug("Search request is: " + searchRequest.toString());
@@ -471,6 +480,9 @@ private SearchResultMetadata extractSearchResultMetadata(@Nonnull SearchResponse
final List<AggregationMetadata> aggregationMetadataList = extractAggregationMetadata(searchResponse, filter);
searchResultMetadata.setAggregations(new AggregationMetadataArray(aggregationMetadataList));
+ final List<SearchSuggestion> searchSuggestions = extractSearchSuggestions(searchResponse);
+ searchResultMetadata.setSuggestions(new SearchSuggestionArray(searchSuggestions));
+
return searchResultMetadata;
}
@@ -517,6 +529,23 @@ public static Map<String, Long> extractTermAggregations(@Nonnull SearchResponse
return extractTermAggregations((ParsedTerms) aggregation, aggregationName.equals("_entityType"));
}
+ private List<SearchSuggestion> extractSearchSuggestions(@Nonnull SearchResponse searchResponse) {
+ final List<SearchSuggestion> searchSuggestions = new ArrayList<>();
+ if (searchResponse.getSuggest() != null) {
+ TermSuggestion termSuggestion = searchResponse.getSuggest().getSuggestion(NAME_SUGGESTION);
+ if (termSuggestion != null && termSuggestion.getEntries().size() > 0) {
+ termSuggestion.getEntries().get(0).getOptions().forEach(suggestOption -> {
+ SearchSuggestion searchSuggestion = new SearchSuggestion();
+ searchSuggestion.setText(String.valueOf(suggestOption.getText()));
+ searchSuggestion.setFrequency(suggestOption.getFreq());
+ searchSuggestion.setScore(suggestOption.getScore());
+ searchSuggestions.add(searchSuggestion);
+ });
+ }
+ }
+ return searchSuggestions;
+ }
+
/**
* Adds nested sub-aggregation values to the aggregated results
* @param aggs The aggregations to traverse. Could be null (base case)
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java
index 8a385e4ab2b543..741eb5568d2ead 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java
@@ -27,6 +27,10 @@
import org.elasticsearch.search.sort.FieldSortBuilder;
import org.elasticsearch.search.sort.ScoreSortBuilder;
import org.elasticsearch.search.sort.SortOrder;
+import org.elasticsearch.search.suggest.SuggestBuilder;
+import org.elasticsearch.search.suggest.SuggestBuilders;
+import org.elasticsearch.search.suggest.SuggestionBuilder;
+import org.elasticsearch.search.suggest.term.TermSuggestionBuilder;
import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.KEYWORD_FIELDS;
import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.PATH_HIERARCHY_FIELDS;
@@ -45,6 +49,9 @@ public class ESUtils {
public static final int MAX_RESULT_SIZE = 10000;
public static final String OPAQUE_ID_HEADER = "X-Opaque-Id";
public static final String HEADER_VALUE_DELIMITER = "|";
+ public static final String KEYWORD_TYPE = "keyword";
+ public static final String ENTITY_NAME_FIELD = "_entityName";
+ public static final String NAME_SUGGESTION = "nameSuggestion";
// we use this to make sure we filter for editable & non-editable fields. Also expands out top-level properties
// to field level properties
@@ -174,6 +181,8 @@ public static QueryBuilder getQueryBuilderFromCriterion(@Nonnull final Criterion
* If no sort criterion is provided then the default sorting criterion is chosen which is descending order of score
* Furthermore to resolve conflicts, the results are further sorted by ascending order of urn
* If the input sort criterion is urn itself, then no additional sort criterion is applied as there will be no conflicts.
+ * When sorting, set the unmappedType param to the arbitrary type "keyword" so that sorting is effectively ignored for
+ * indices that do not have the field we are sorting on.
*
*
* @param searchSourceBuilder {@link SearchSourceBuilder} that needs to be populated with sort order
@@ -187,13 +196,24 @@ public static void buildSortOrder(@Nonnull SearchSourceBuilder searchSourceBuild
final SortOrder esSortOrder =
(sortCriterion.getOrder() == com.linkedin.metadata.query.filter.SortOrder.ASCENDING) ? SortOrder.ASC
: SortOrder.DESC;
- searchSourceBuilder.sort(new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder));
+ searchSourceBuilder.sort(new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder).unmappedType(KEYWORD_TYPE));
}
if (sortCriterion == null || !sortCriterion.getField().equals(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD)) {
searchSourceBuilder.sort(new FieldSortBuilder(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD).order(SortOrder.ASC));
}
}
+ /**
+ * Populates the source of the search query with a suggestions query so that search suggestions are returned.
+ * Right now we only support suggestions based on the virtual _entityName field alias.
+ */
+ public static void buildNameSuggestions(@Nonnull SearchSourceBuilder searchSourceBuilder, @Nullable String textInput) {
+ SuggestionBuilder<TermSuggestionBuilder> builder = SuggestBuilders.termSuggestion(ENTITY_NAME_FIELD).text(textInput);
+ SuggestBuilder suggestBuilder = new SuggestBuilder();
+ suggestBuilder.addSuggestion(NAME_SUGGESTION, builder);
+ searchSourceBuilder.suggest(suggestBuilder);
+ }
+
/**
* Escapes the Elasticsearch reserved characters in the given input string.
*
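
For context on buildNameSuggestions above, roughly the JSON it adds to the search body: a term suggester named after NAME_SUGGESTION that runs against the _entityName alias (the exact serialization is up to the Elasticsearch client):

def name_suggestion_body(text_input: str) -> dict:
    return {
        "suggest": {
            "nameSuggestion": {                    # NAME_SUGGESTION
                "text": text_input,
                "term": {"field": "_entityName"},  # ENTITY_NAME_FIELD alias
            }
        }
    }

print(name_suggestion_body("dataset nmae with a typo"))
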
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java
index 35a322d37b2fde..8b56ae0beb3f15 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java
@@ -78,7 +78,7 @@ public static Map<String, String> getRequestMap(@Nullable Filter requestParams)
return criterionArray.stream().collect(Collectors.toMap(Criterion::getField, Criterion::getValue));
}
- static boolean isUrn(@Nonnull String value) {
+ public static boolean isUrn(@Nonnull String value) {
// TODO(https://github.com/datahub-project/datahub-gma/issues/51): This method is a bit of a hack to support searching for
// URNs that have commas in them, while also using commas a delimiter for search. We should stop supporting commas
// as delimiter, and then we can stop using this hack.
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java
index 1e5b860b581fc4..673474c96cc512 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java
@@ -6,6 +6,7 @@
import com.linkedin.metadata.config.search.ExactMatchConfiguration;
import com.linkedin.metadata.config.search.PartialConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
+import com.linkedin.metadata.config.search.WordGramConfiguration;
import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
import com.linkedin.metadata.models.registry.ConfigEntityRegistry;
import com.linkedin.metadata.models.registry.EntityRegistry;
@@ -55,11 +56,17 @@ public SearchConfiguration searchConfiguration() {
exactMatchConfiguration.setCaseSensitivityFactor(0.7f);
exactMatchConfiguration.setEnableStructured(true);
+ WordGramConfiguration wordGramConfiguration = new WordGramConfiguration();
+ wordGramConfiguration.setTwoGramFactor(1.2f);
+ wordGramConfiguration.setThreeGramFactor(1.5f);
+ wordGramConfiguration.setFourGramFactor(1.8f);
+
PartialConfiguration partialConfiguration = new PartialConfiguration();
partialConfiguration.setFactor(0.4f);
partialConfiguration.setUrnFactor(0.5f);
searchConfiguration.setExactMatch(exactMatchConfiguration);
+ searchConfiguration.setWordGram(wordGramConfiguration);
searchConfiguration.setPartial(partialConfiguration);
return searchConfiguration;
}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java
index cc0d9dca6ae5f1..d720c95fef84d0 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java
@@ -78,23 +78,6 @@ public void testNameMatchPetProfile() {
assertTrue(secondResultUrn.toString().contains("pet_profiles"));
}
- @Test
- public void testNameMatchMemberInWorkspace() {
- /*
- Searching for "collaborative actionitems" should return "collaborative_actionitems" as the first search
- result, followed by "collaborative_actionitems_old"
- */
- assertNotNull(searchService);
- SearchResult searchResult = searchAcrossEntities(searchService, "collaborative actionitems", SEARCHABLE_LONGTAIL_ENTITIES);
- assertTrue(searchResult.getEntities().size() >= 2);
- Urn firstResultUrn = searchResult.getEntities().get(0).getEntity();
- Urn secondResultUrn = searchResult.getEntities().get(1).getEntity();
-
- // Checks that the table name is not suffixed with anything
- assertTrue(firstResultUrn.toString().contains("collaborative_actionitems,"));
- assertTrue(secondResultUrn.toString().contains("collaborative_actionitems_old"));
- }
-
@Test
public void testGlossaryTerms() {
/*
@@ -116,15 +99,7 @@ public void testGlossaryTerms() {
assertTrue(fourthResultMatchedFields.toString().contains("ReturnRate"));
}
- /**
- *
- * The test below should be added back in as improvements are made to search,
- * via the linked tickets.
- *
- **/
-
- // TODO: enable once PFP-481 is complete
- @Test(enabled = false)
+ @Test
public void testNameMatchPartiallyQualified() {
/*
Searching for "analytics.pet_details" (partially qualified) should return the fully qualified table
@@ -140,4 +115,53 @@ public void testNameMatchPartiallyQualified() {
assertTrue(secondResultUrn.toString().contains("dbt,long_tail_companions.analytics.pet_details"));
}
+ @Test
+ public void testNameMatchCollaborativeActionitems() {
+ /*
+ Searching for "collaborative actionitems" should return "collaborative_actionitems" as the first search
+ result, followed by "collaborative_actionitems_old"
+ */
+ assertNotNull(searchService);
+ SearchResult searchResult = searchAcrossEntities(searchService, "collaborative actionitems", SEARCHABLE_LONGTAIL_ENTITIES);
+ assertTrue(searchResult.getEntities().size() >= 2);
+ Urn firstResultUrn = searchResult.getEntities().get(0).getEntity();
+ Urn secondResultUrn = searchResult.getEntities().get(1).getEntity();
+
+ // Checks that the table name is not suffixed with anything
+ assertTrue(firstResultUrn.toString().contains("collaborative_actionitems,"));
+ assertTrue(secondResultUrn.toString().contains("collaborative_actionitems_old"));
+
+ Double firstResultScore = searchResult.getEntities().get(0).getScore();
+ Double secondResultScore = searchResult.getEntities().get(1).getScore();
+
+ // Checks that the scores aren't tied so that we are matching on table name more than column name
+ assertTrue(firstResultScore > secondResultScore);
+ }
+
+ @Test
+ public void testNameMatchCustomerOrders() {
+ /*
+ Searching for "customer orders" should return "customer_orders" as the first search
+ result, not suffixed by anything
+ */
+ assertNotNull(searchService);
+ SearchResult searchResult = searchAcrossEntities(searchService, "customer orders", SEARCHABLE_LONGTAIL_ENTITIES);
+ assertTrue(searchResult.getEntities().size() >= 2);
+ Urn firstResultUrn = searchResult.getEntities().get(0).getEntity();
+
+ // Checks that the table name is not suffixed with anything
+ assertTrue(firstResultUrn.toString().contains("customer_orders,"));
+
+ Double firstResultScore = searchResult.getEntities().get(0).getScore();
+ Double secondResultScore = searchResult.getEntities().get(1).getScore();
+
+ // Checks that the scores aren't tied so that we are matching on table name more than column name
+ assertTrue(firstResultScore > secondResultScore);
+ }
+
+ /*
+ Tests that should pass but do not yet pass can be added below, with the following annotation:
+ @Test(enabled = false)
+ */
+
}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java
index 2f1e48c18450d9..d989d4ef4fa87e 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java
@@ -358,6 +358,84 @@ public void testDelimitedSynonym() throws IOException {
}).collect(Collectors.toList());
}
+ @Test
+ public void testNegateAnalysis() throws IOException {
+ String queryWithMinus = "logging_events -bckp";
+ AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer(
+ "smpldat_datasetindex_v2",
+ "query_word_delimited", queryWithMinus
+ );
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()),
+ List.of("logging_events -bckp", "logging_ev", "-bckp", "log", "event", "bckp"));
+
+ request = AnalyzeRequest.withIndexAnalyzer(
+ "smpldat_datasetindex_v2",
+ "word_gram_3", queryWithMinus
+ );
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("logging events -bckp"));
+
+ request = AnalyzeRequest.withIndexAnalyzer(
+ "smpldat_datasetindex_v2",
+ "word_gram_4", queryWithMinus
+ );
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of());
+
+ }
+
+ @Test
+ public void testWordGram() throws IOException {
+ String text = "hello.cat_cool_customer";
+ AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", text);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat", "cat cool", "cool customer"));
+ request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", text);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool", "cat cool customer"));
+ request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", text);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool customer"));
+
+ String testMoreSeparators = "quick.brown:fox jumped-LAZY_Dog";
+ request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", testMoreSeparators);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()),
+ List.of("quick brown", "brown fox", "fox jumped", "jumped lazy", "lazy dog"));
+ request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", testMoreSeparators);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()),
+ List.of("quick brown fox", "brown fox jumped", "fox jumped lazy", "jumped lazy dog"));
+ request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", testMoreSeparators);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()),
+ List.of("quick brown fox jumped", "brown fox jumped lazy", "fox jumped lazy dog"));
+
+ String textWithQuotesAndDuplicateWord = "\"my_db.my_exact_table\"";
+ request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithQuotesAndDuplicateWord);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db", "db my", "my exact", "exact table"));
+ request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", textWithQuotesAndDuplicateWord);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my", "db my exact", "my exact table"));
+ request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", textWithQuotesAndDuplicateWord);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my exact", "db my exact table"));
+
+ String textWithParens = "(hi) there";
+ request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithParens);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hi there"));
+
+ String oneWordText = "hello";
+ for (String analyzer : List.of("word_gram_2", "word_gram_3", "word_gram_4")) {
+ request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", analyzer, oneWordText);
+ assertEquals(getTokens(request)
+ .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of());
+ }
+ }
+
@Test
public void testUrnSynonym() throws IOException {
List<String> expectedTokens = List.of("bigquery");
@@ -1267,6 +1345,53 @@ public void testParens() {
String.format("%s - Expected search results to include matched fields", query));
assertEquals(result.getEntities().size(), 2);
}
+ @Test
+ public void testGram() {
+ String query = "jaffle shop customers";
+ SearchResult result = searchAcrossEntities(searchService, query);
+ assertTrue(result.hasEntities() && !result.getEntities().isEmpty(),
+ String.format("%s - Expected search results", query));
+
+ assertEquals(result.getEntities().get(0).getEntity().toString(),
+ "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)",
+ "Expected exact match in 1st position");
+
+ query = "shop customers source";
+ result = searchAcrossEntities(searchService, query);
+ assertTrue(result.hasEntities() && !result.getEntities().isEmpty(),
+ String.format("%s - Expected search results", query));
+
+ assertEquals(result.getEntities().get(0).getEntity().toString(),
+ "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers_source,PROD)",
+ "Expected ngram match in 1st position");
+
+ query = "jaffle shop stg customers";
+ result = searchAcrossEntities(searchService, query);
+ assertTrue(result.hasEntities() && !result.getEntities().isEmpty(),
+ String.format("%s - Expected search results", query));
+
+ assertEquals(result.getEntities().get(0).getEntity().toString(),
+ "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.stg_customers,PROD)",
+ "Expected ngram match in 1st position");
+
+ query = "jaffle shop transformers customers";
+ result = searchAcrossEntities(searchService, query);
+ assertTrue(result.hasEntities() && !result.getEntities().isEmpty(),
+ String.format("%s - Expected search results", query));
+
+ assertEquals(result.getEntities().get(0).getEntity().toString(),
+ "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.transformers_customers,PROD)",
+ "Expected ngram match in 1st position");
+
+ query = "shop raw customers";
+ result = searchAcrossEntities(searchService, query);
+ assertTrue(result.hasEntities() && !result.getEntities().isEmpty(),
+ String.format("%s - Expected search results", query));
+
+ assertEquals(result.getEntities().get(0).getEntity().toString(),
+ "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_customers,PROD)",
+ "Expected ngram match in 1st position");
+ }
@Test
public void testPrefixVsExact() {
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java
index ed72b46e98c46c..0b331855492990 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java
@@ -16,7 +16,7 @@ public void testMappingsBuilder() {
Map result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec());
assertEquals(result.size(), 1);
Map properties = (Map) result.get("properties");
- assertEquals(properties.size(), 17);
+ assertEquals(properties.size(), 19);
assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword",
"fields",
ImmutableMap.of("delimited",
@@ -66,6 +66,11 @@ public void testMappingsBuilder() {
assertTrue(textFieldSubfields.containsKey("delimited"));
assertTrue(textFieldSubfields.containsKey("keyword"));
+ // TEXT with addToFilters aliased under "_entityName"
+ Map textFieldAlias = (Map) properties.get("_entityName");
+ assertEquals(textFieldAlias.get("type"), "alias");
+ assertEquals(textFieldAlias.get("path"), "textFieldOverride");
+
// TEXT_PARTIAL
Map textArrayField = (Map) properties.get("textArrayField");
assertEquals(textArrayField.get("type"), "keyword");
@@ -76,6 +81,19 @@ public void testMappingsBuilder() {
assertTrue(textArrayFieldSubfields.containsKey("ngram"));
assertTrue(textArrayFieldSubfields.containsKey("keyword"));
+ // WORD_GRAM
+ Map wordGramField = (Map) properties.get("wordGramField");
+ assertEquals(wordGramField.get("type"), "keyword");
+ assertEquals(wordGramField.get("normalizer"), "keyword_normalizer");
+ Map wordGramFieldSubfields = (Map) wordGramField.get("fields");
+ assertEquals(wordGramFieldSubfields.size(), 6);
+ assertTrue(wordGramFieldSubfields.containsKey("delimited"));
+ assertTrue(wordGramFieldSubfields.containsKey("ngram"));
+ assertTrue(wordGramFieldSubfields.containsKey("keyword"));
+ assertTrue(wordGramFieldSubfields.containsKey("wordGrams2"));
+ assertTrue(wordGramFieldSubfields.containsKey("wordGrams3"));
+ assertTrue(wordGramFieldSubfields.containsKey("wordGrams4"));
+
// URN
Map foreignKey = (Map) properties.get("foreignKey");
assertEquals(foreignKey.get("type"), "text");
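
To summarize the WORD_GRAM subfield layout the test above asserts (analyzer names per SettingsBuilder; the delimited/ngram/keyword entries reuse the existing TEXT/TEXT_PARTIAL configuration and are elided here):

word_gram_field_subfields = {
    "delimited": "...",  # same delimited subfield as TEXT fields
    "ngram": "...",      # partial-match subfield, as for TEXT_PARTIAL
    "keyword": "...",    # keyword subfield without the lowercase filter
    "wordGrams2": {"type": "text", "analyzer": "word_gram_2", "search_analyzer": "word_gram_2"},
    "wordGrams3": {"type": "text", "analyzer": "word_gram_3", "search_analyzer": "word_gram_3"},
    "wordGrams4": {"type": "text", "analyzer": "word_gram_4", "search_analyzer": "word_gram_4"},
}
print(len(word_gram_field_subfields))  # 6, as asserted above
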
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java
index 10b4ee42b1a716..36c8bb8f9a6764 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java
@@ -31,7 +31,8 @@ public void testGetDefaultAggregationsHasFields() {
1.0,
Optional.of("hasTest"),
Optional.empty(),
- Collections.emptyMap()
+ Collections.emptyMap(),
+ Collections.emptyList()
);
SearchConfiguration config = new SearchConfiguration();
@@ -60,7 +61,8 @@ public void testGetDefaultAggregationsFields() {
1.0,
Optional.empty(),
Optional.empty(),
- Collections.emptyMap()
+ Collections.emptyMap(),
+ Collections.emptyList()
);
SearchConfiguration config = new SearchConfiguration();
@@ -89,7 +91,8 @@ public void testGetSpecificAggregationsHasFields() {
1.0,
Optional.of("hasTest1"),
Optional.empty(),
- Collections.emptyMap()
+ Collections.emptyMap(),
+ Collections.emptyList()
);
SearchableAnnotation annotation2 = new SearchableAnnotation(
@@ -104,7 +107,8 @@ public void testGetSpecificAggregationsHasFields() {
1.0,
Optional.empty(),
Optional.empty(),
- Collections.emptyMap()
+ Collections.emptyMap(),
+ Collections.emptyList()
);
SearchConfiguration config = new SearchConfiguration();
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java
index a2ec396c34b2d5..282b1d8bb67788 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java
@@ -4,6 +4,7 @@
import com.linkedin.metadata.config.search.ExactMatchConfiguration;
import com.linkedin.metadata.config.search.PartialConfiguration;
import com.linkedin.metadata.config.search.SearchConfiguration;
+import com.linkedin.metadata.config.search.WordGramConfiguration;
import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration;
import com.fasterxml.jackson.dataformat.yaml.YAMLMapper;
import com.google.common.collect.ImmutableList;
@@ -18,6 +19,7 @@
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchAllQueryBuilder;
import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder;
+import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.index.query.SimpleQueryStringBuilder;
@@ -46,11 +48,17 @@ public class SearchQueryBuilderTest {
exactMatchConfiguration.setCaseSensitivityFactor(0.7f);
exactMatchConfiguration.setEnableStructured(true);
+ WordGramConfiguration wordGramConfiguration = new WordGramConfiguration();
+ wordGramConfiguration.setTwoGramFactor(1.2f);
+ wordGramConfiguration.setThreeGramFactor(1.5f);
+ wordGramConfiguration.setFourGramFactor(1.8f);
+
PartialConfiguration partialConfiguration = new PartialConfiguration();
partialConfiguration.setFactor(0.4f);
partialConfiguration.setUrnFactor(0.7f);
testQueryConfig.setExactMatch(exactMatchConfiguration);
+ testQueryConfig.setWordGram(wordGramConfiguration);
testQueryConfig.setPartial(partialConfiguration);
}
public static final SearchQueryBuilder TEST_BUILDER = new SearchQueryBuilder(testQueryConfig, null);
@@ -70,16 +78,17 @@ public void testQueryBuilderFulltext() {
assertEquals(keywordQuery.value(), "testQuery");
assertEquals(keywordQuery.analyzer(), "keyword");
Map<String, Float> keywordFields = keywordQuery.fields();
- assertEquals(keywordFields.size(), 8);
+ assertEquals(keywordFields.size(), 9);
assertEquals(keywordFields, Map.of(
- "urn", 10.f,
- "textArrayField", 1.0f,
- "customProperties", 1.0f,
- "nestedArrayArrayField", 1.0f,
- "textFieldOverride", 1.0f,
- "nestedArrayStringField", 1.0f,
- "keyPart1", 10.0f,
- "esObjectField", 1.0f
+ "urn", 10.f,
+ "textArrayField", 1.0f,
+ "customProperties", 1.0f,
+ "wordGramField", 1.0f,
+ "nestedArrayArrayField", 1.0f,
+ "textFieldOverride", 1.0f,
+ "nestedArrayStringField", 1.0f,
+ "keyPart1", 10.0f,
+ "esObjectField", 1.0f
));
SimpleQueryStringBuilder urnComponentQuery = (SimpleQueryStringBuilder) analyzerGroupQuery.should().get(1);
@@ -99,7 +108,8 @@ public void testQueryBuilderFulltext() {
"nestedArrayArrayField.delimited", 0.4f,
"urn.delimited", 7.0f,
"textArrayField.delimited", 0.4f,
- "nestedArrayStringField.delimited", 0.4f
+ "nestedArrayStringField.delimited", 0.4f,
+ "wordGramField.delimited", 0.4f
));
BoolQueryBuilder boolPrefixQuery = (BoolQueryBuilder) shouldQueries.get(1);
@@ -109,21 +119,30 @@ public void testQueryBuilderFulltext() {
if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) {
MatchPhrasePrefixQueryBuilder builder = (MatchPhrasePrefixQueryBuilder) prefixQuery;
return Pair.of(builder.fieldName(), builder.boost());
- } else {
+ } else if (prefixQuery instanceof TermQueryBuilder) {
// exact
TermQueryBuilder builder = (TermQueryBuilder) prefixQuery;
return Pair.of(builder.fieldName(), builder.boost());
+ } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) {
+ // ngram
+ MatchPhraseQueryBuilder builder = (MatchPhraseQueryBuilder) prefixQuery;
+ return Pair.of(builder.fieldName(), builder.boost());
}
}).collect(Collectors.toList());
- assertEquals(prefixFieldWeights.size(), 22);
+ assertEquals(prefixFieldWeights.size(), 28);
List.of(
Pair.of("urn", 100.0f),
Pair.of("urn", 70.0f),
Pair.of("keyPart1.delimited", 16.8f),
Pair.of("keyPart1.keyword", 100.0f),
- Pair.of("keyPart1.keyword", 70.0f)
+ Pair.of("keyPart1.keyword", 70.0f),
+ Pair.of("wordGramField.wordGrams2", 1.44f),
+ Pair.of("wordGramField.wordGrams3", 2.25f),
+ Pair.of("wordGramField.wordGrams4", 3.2399998f),
+ Pair.of("wordGramField.keyword", 10.0f),
+ Pair.of("wordGramField.keyword", 7.0f)
).forEach(p -> assertTrue(prefixFieldWeights.contains(p), "Missing: " + p));
// Validate scorer
@@ -144,7 +163,7 @@ public void testQueryBuilderStructured() {
assertEquals(keywordQuery.queryString(), "testQuery");
assertNull(keywordQuery.analyzer());
Map<String, Float> keywordFields = keywordQuery.fields();
- assertEquals(keywordFields.size(), 16);
+ assertEquals(keywordFields.size(), 21);
assertEquals(keywordFields.get("keyPart1").floatValue(), 10.0f);
assertFalse(keywordFields.containsKey("keyPart3"));
assertEquals(keywordFields.get("textFieldOverride").floatValue(), 1.0f);
@@ -196,10 +215,14 @@ public void testCustomExactMatch() {
List<QueryBuilder> queries = boolPrefixQuery.should().stream().map(prefixQuery -> {
if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) {
+ // prefix
return (MatchPhrasePrefixQueryBuilder) prefixQuery;
- } else {
+ } else if (prefixQuery instanceof TermQueryBuilder) {
// exact
return (TermQueryBuilder) prefixQuery;
+ } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) {
+ // ngram
+ return (MatchPhraseQueryBuilder) prefixQuery;
}
}).collect(Collectors.toList());
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java
index d66d6a0ab0e760..db56e2d34881b6 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java
@@ -7,6 +7,7 @@
import com.linkedin.data.template.StringArray;
import com.linkedin.metadata.ESTestConfiguration;
import com.linkedin.metadata.TestEntitySpecBuilder;
+import com.linkedin.metadata.config.search.WordGramConfiguration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@@ -65,11 +66,17 @@ public class SearchRequestHandlerTest extends AbstractTestNGSpringContextTests {
exactMatchConfiguration.setCaseSensitivityFactor(0.7f);
exactMatchConfiguration.setEnableStructured(true);
+ WordGramConfiguration wordGramConfiguration = new WordGramConfiguration();
+ wordGramConfiguration.setTwoGramFactor(1.2f);
+ wordGramConfiguration.setThreeGramFactor(1.5f);
+ wordGramConfiguration.setFourGramFactor(1.8f);
+
PartialConfiguration partialConfiguration = new PartialConfiguration();
partialConfiguration.setFactor(0.4f);
partialConfiguration.setUrnFactor(0.7f);
testQueryConfig.setExactMatch(exactMatchConfiguration);
+ testQueryConfig.setWordGram(wordGramConfiguration);
testQueryConfig.setPartial(partialConfiguration);
}
@@ -113,10 +120,10 @@ public void testSearchRequestHandler() {
HighlightBuilder highlightBuilder = sourceBuilder.highlighter();
List fields =
highlightBuilder.fields().stream().map(HighlightBuilder.Field::name).collect(Collectors.toList());
- assertEquals(fields.size(), 20);
+ assertEquals(fields.size(), 22);
List highlightableFields =
ImmutableList.of("keyPart1", "textArrayField", "textFieldOverride", "foreignKey", "nestedForeignKey",
- "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField");
+ "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField", "wordGramField");
highlightableFields.forEach(field -> {
assertTrue(fields.contains(field), "Missing: " + field);
assertTrue(fields.contains(field + ".*"), "Missing: " + field + ".*");
diff --git a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl
index 4339a186f13045..9fea71003ae6e5 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl
@@ -20,8 +20,9 @@ record ChartInfo includes CustomProperties, ExternalReference {
* Title of the chart
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
- "enableAutocomplete": true
+ "fieldType": "WORD_GRAM",
+ "enableAutocomplete": true,
+ "fieldNameAliases": [ "_entityName" ]
}
title: string
diff --git a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl
index 26745fe46caaac..526878cbe60d33 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl
@@ -15,9 +15,10 @@ record ContainerProperties includes CustomProperties, ExternalReference {
* Display name of the Asset Container
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
@@ -25,7 +26,7 @@ record ContainerProperties includes CustomProperties, ExternalReference {
* Fully-qualified name of the Container
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@@ -61,4 +62,4 @@ record ContainerProperties includes CustomProperties, ExternalReference {
}
}
lastModified: optional TimeStamp
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl
index 5cb306039506e2..c436011eb58db6 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl
@@ -22,9 +22,10 @@ record DashboardInfo includes CustomProperties, ExternalReference {
* Title of the dashboard
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
title: string
@@ -126,4 +127,4 @@ record DashboardInfo includes CustomProperties, ExternalReference {
* The time when this dashboard last refreshed
*/
lastRefreshed: optional Time
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl
index 481240740876a1..2ff3e8cd930afc 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl
@@ -17,9 +17,10 @@ record DataFlowInfo includes CustomProperties, ExternalReference {
* Flow name
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl
index 8737dd4d9ef52c..250fb760037776 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl
@@ -18,9 +18,10 @@ record DataJobInfo includes CustomProperties, ExternalReference {
* Job name
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl
index acc40e9f693ec0..5dd35c7f49520f 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl
@@ -15,9 +15,10 @@ record DataPlatformInfo {
*/
@validate.strlen.max = 15
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": false,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
@@ -25,7 +26,7 @@ record DataPlatformInfo {
* The name that will be used for displaying a platform type.
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl
index d7ce5565103ee3..b24e220ac3bcfb 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl
@@ -16,9 +16,10 @@ record DataPlatformInstanceProperties includes CustomProperties, ExternalReferen
* Display name of the Data Platform Instance
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: optional string
diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl
index 72eefd5e294e45..c63cb1a97c017d 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl
@@ -19,7 +19,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc
* Process name
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@@ -31,6 +31,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc
@Searchable = {
"fieldType": "KEYWORD",
"addToFilters": true,
+ "fieldName": "processType",
"filterNameOverride": "Process Type"
}
type: optional enum DataProcessType {
diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl
index 3861b7def7669b..b2d26094fd0b79 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl
@@ -13,9 +13,10 @@ record DataProductProperties includes CustomProperties, ExternalReference {
* Display name of the Data Product
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: optional string
diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl
index 57b1fe76931299..ad8705a29d4ed3 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl
@@ -17,9 +17,10 @@ record DatasetProperties includes CustomProperties, ExternalReference {
* Display name of the Dataset
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: optional string
@@ -27,7 +28,7 @@ record DatasetProperties includes CustomProperties, ExternalReference {
* Fully-qualified name of the Dataset
*/
@Searchable = {
- "fieldType": "TEXT",
+ "fieldType": "WORD_GRAM",
"addToFilters": false,
"enableAutocomplete": true,
"boostScore": 10.0
@@ -77,4 +78,4 @@ record DatasetProperties includes CustomProperties, ExternalReference {
*/
@deprecated = "Use GlobalTags aspect instead."
tags: array[string] = [ ]
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl
index 5a0b8657ecb472..5c8c8a4912e4c3 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl
@@ -14,9 +14,10 @@ record DomainProperties {
* Display name of the Domain
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl
index 1e840e5a1df7ea..c3388d4f462d49 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl
@@ -35,9 +35,10 @@ record GlossaryNodeInfo {
*/
@Searchable = {
"fieldName": "displayName",
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: optional string
@@ -49,4 +50,4 @@ record GlossaryNodeInfo {
}
id: optional string
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl
index aa2a8b31e3ddec..e987a71be7131d 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl
@@ -23,9 +23,10 @@ record GlossaryTermInfo includes CustomProperties {
* Display name of the term
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: optional string
@@ -75,4 +76,4 @@ record GlossaryTermInfo includes CustomProperties {
*/
@deprecated
rawSchema: optional string
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl
index 8d764604237da1..28b87476c61bd8 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl
@@ -21,7 +21,8 @@ record CorpGroupInfo {
"fieldType": "TEXT_PARTIAL"
"queryByDefault": true,
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
displayName: optional string
diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl
index 6b050f484fedd2..48ee53377e5820 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl
@@ -45,7 +45,7 @@ record CorpUserEditableInfo {
* DataHub-native display name
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"queryByDefault": true,
"boostScore": 10.0
}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl
index 1cb705d426cc0e..382b120fa942a7 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl
@@ -26,10 +26,11 @@ record CorpUserInfo includes CustomProperties {
* displayName of this user , e.g. Hang Zhang(DataHQ)
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"queryByDefault": true,
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
displayName: optional string
@@ -89,7 +90,7 @@ record CorpUserInfo includes CustomProperties {
* Common name of this user, format is firstName + lastName (split by a whitespace)
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"queryByDefault": true,
"enableAutocomplete": true,
"boostScore": 10.0
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl
index 075cc14ddc83b5..9e65b8f6e9929a 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl
@@ -11,10 +11,10 @@ record CorpGroupKey {
* The URL-encoded name of the AD/LDAP group. Serves as a globally unique identifier within DataHub.
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"queryByDefault": true,
"enableAutocomplete": true,
"boostScore": 10.0
}
name: string
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl
index d1a8a4bb5bb232..476a0ad9704b36 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl
@@ -12,7 +12,7 @@ record CorpUserKey {
*/
@Searchable = {
"fieldName": "ldap",
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"boostScore": 2.0,
"enableAutocomplete": true
}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl
index bcdb92f75d0558..d8342630248b61 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl
@@ -19,7 +19,7 @@ record DataFlowKey {
* Unique Identifier of the data flow
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
flowId: string
@@ -31,4 +31,4 @@ record DataFlowKey {
"fieldType": "TEXT_PARTIAL"
}
cluster: string
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl
index d0ac7dbca0f999..60ec51b464dcc2 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl
@@ -27,7 +27,7 @@ record DataJobKey {
* Unique Identifier of the data job
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
jobId: string
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl
index a5c05029352c2e..4df1364a04ebe0 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl
@@ -13,7 +13,7 @@ record DataProcessKey {
* Process name i.e. an ETL job name
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 4.0
}
@@ -37,4 +37,4 @@ record DataProcessKey {
"queryByDefault": false
}
origin: FabricType
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl
index ea1f9510ed4389..70c5d174171afb 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl
@@ -25,7 +25,7 @@ record DatasetKey {
//This is no longer to be used for Dataset native name. Use name, qualifiedName from DatasetProperties instead.
@Searchable = {
"fieldName": "id"
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl
index 88697fe3ff3647..51a3bc00f4e9eb 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl
@@ -12,9 +12,9 @@ import com.linkedin.common.FabricType
record GlossaryNodeKey {
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true
}
name: string
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl
index a9f35146da18eb..61bcd60cbc7549 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl
@@ -13,10 +13,10 @@ record GlossaryTermKey {
* The term name, which serves as a unique id
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"fieldName": "id"
}
name: string
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl
index 579f1966977a97..050b954c89fb89 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl
@@ -20,9 +20,10 @@ record MLFeatureKey {
* Name of the feature
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 8.0
+ "boostScore": 8.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl
index 1f786ad417be72..175a7b0d31b004 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl
@@ -22,9 +22,10 @@ record MLFeatureTableKey {
* Name of the feature table
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 8.0
+ "boostScore": 8.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl
index 7c36f410fede32..daa1deceb5fc33 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl
@@ -19,9 +19,10 @@ record MLModelDeploymentKey {
* Name of the MLModelDeployment
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
@@ -35,4 +36,4 @@ record MLModelDeploymentKey {
"queryByDefault": false
}
origin: FabricType
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl
index 17c401c0b8c487..582a899633c2a1 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl
@@ -19,9 +19,10 @@ record MLModelGroupKey {
* Name of the MLModelGroup
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
@@ -33,4 +34,4 @@ record MLModelGroupKey {
"queryByDefault": false
}
origin: FabricType
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl
index 55fd2bc3708463..f097bbda738a2a 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl
@@ -19,9 +19,10 @@ record MLModelKey {
* Name of the MLModel
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
@@ -35,4 +36,4 @@ record MLModelKey {
"queryByDefault": false
}
origin: FabricType
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl
index 9eb67eaf5f651d..ef812df206b46b 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl
@@ -21,9 +21,10 @@ record MLPrimaryKeyKey {
* Name of the primary key
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 8.0
+ "boostScore": 8.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl
index 47f1a631b4a2cf..4622e32dce67b7 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl
@@ -11,10 +11,10 @@ record TagKey {
* The tag name, which serves as a unique id
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0,
"fieldName": "id"
}
name: string
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl
index 05a94b8fabc4b5..be1a30c7f082c6 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl
@@ -28,4 +28,9 @@ record SearchFlags {
* Whether to skip aggregates/facets
*/
skipAggregates:optional boolean = false
+
+ /**
+ * Whether to request search suggestions on the _entityName virtualized field
+ */
+ getSuggestions:optional boolean = false
}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl
index 718d80ba4cb363..60f1b568f586a0 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl
@@ -12,4 +12,9 @@ record SearchResultMetadata {
*/
aggregations: array[AggregationMetadata] = []
+ /**
+ * A list of search query suggestions based on the given query
+ */
+ suggestions: array[SearchSuggestion] = []
+
}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl
new file mode 100644
index 00000000000000..7776ec54fe03e6
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl
@@ -0,0 +1,24 @@
+namespace com.linkedin.metadata.search
+
+/**
+ * The model for a single search query suggestion
+ */
+record SearchSuggestion {
+
+ /**
+ * The suggestion text for this search query
+ */
+ text: string
+
+ /**
+ * The score for how close this suggestion is to the original search query.
+ * A score closer to 1 means the suggestion is closer to the original query; a score closer to 0 means it is further away.
+ */
+ score: float
+
+ /**
+ * How many matches there are with the suggested text for the given field
+ */
+ frequency: long
+
+}
\ No newline at end of file
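Note for reviewers: the `SearchSuggestion.pdl` record above is compiled into a Pegasus data template. A rough sketch of how a caller could read suggestions off the enriched `SearchResultMetadata`, assuming the usual generated accessors (`getSuggestions()`, `getText()`, `getScore()`, `getFrequency()`) — the accessor names are inferred from the schema, not taken from this diff:

```java
import com.linkedin.metadata.search.SearchResultMetadata;
import com.linkedin.metadata.search.SearchSuggestion;

public final class SuggestionPrinter {

    /** Logs each "did you mean"-style candidate returned alongside the search results. */
    public static void print(SearchResultMetadata metadata) {
        for (SearchSuggestion suggestion : metadata.getSuggestions()) {
            System.out.printf("%s (score=%.2f, frequency=%d)%n",
                suggestion.getText(), suggestion.getScore(), suggestion.getFrequency());
        }
    }
}
```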
diff --git a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl
index 1f4dcf975f48c9..8ec5f262890f3c 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl
@@ -18,9 +18,10 @@ record NotebookInfo includes CustomProperties, ExternalReference {
* Title of the Notebook
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
title: string
diff --git a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl
index 004df6e399be40..3e7b53beff5317 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl
@@ -14,7 +14,7 @@ record OwnershipTypeInfo {
* Display name of the Ownership Type
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@@ -54,4 +54,4 @@ record OwnershipTypeInfo {
}
}
lastModified: AuditStamp
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl
index bb7e22900e168a..3ba19d348913bf 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl
@@ -29,7 +29,7 @@ record QueryProperties {
* Optional display name to identify the query.
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
@@ -69,4 +69,4 @@ record QueryProperties {
}
}
lastModified: AuditStamp
-}
\ No newline at end of file
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl
index acebdf5558c59a..8422d3c49046ce 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl
@@ -14,9 +14,10 @@ record RoleProperties {
* Display name of the IAM Role in the external system
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
diff --git a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl
index 41c500c6fff2f6..9df47fac3928ae 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl
@@ -11,9 +11,10 @@ record TagProperties {
* Display name of the tag
*/
@Searchable = {
- "fieldType": "TEXT_PARTIAL",
+ "fieldType": "WORD_GRAM",
"enableAutocomplete": true,
- "boostScore": 10.0
+ "boostScore": 10.0,
+ "fieldNameAliases": [ "_entityName" ]
}
name: string
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
index 690528059b5557..f653ccf72cf545 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
@@ -250,11 +250,11 @@ private void addPoliciesToCache(final Map> cache
private void addPolicyToCache(final Map> cache, final DataHubPolicyInfo policy) {
final List privileges = policy.getPrivileges();
for (String privilege : privileges) {
- List existingPolicies = cache.getOrDefault(privilege, new ArrayList<>());
+ List existingPolicies = cache.containsKey(privilege) ? new ArrayList<>(cache.get(privilege)) : new ArrayList<>();
existingPolicies.add(policy);
cache.put(privilege, existingPolicies);
}
- List existingPolicies = cache.getOrDefault(ALL, new ArrayList<>());
+ List existingPolicies = cache.containsKey(ALL) ? new ArrayList<>(cache.get(ALL)) : new ArrayList<>();
existingPolicies.add(policy);
cache.put(ALL, existingPolicies);
}
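A note on the authorizer hunk above: `Map.getOrDefault` hands back the live list stored in the cache when the key exists, so appending to it mutates a list that concurrent readers may still be iterating. The replacement copies the cached list before appending and then swaps the copy back in. A minimal sketch of that copy-then-put pattern, with a hypothetical `Policy` record and cache standing in for the real DataHub types:

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

final class PolicyCacheSketch {

    /** Hypothetical stand-in for DataHubPolicyInfo. */
    record Policy(String name) { }

    private final Map<String, List<Policy>> cache = new HashMap<>();

    /** Appends to a fresh copy so readers holding the old list never see a partial update. */
    void addPolicyToCache(String privilege, Policy policy) {
        List<Policy> existing = cache.containsKey(privilege)
                ? new ArrayList<>(cache.get(privilege))  // defensive copy of the cached list
                : new ArrayList<>();
        existing.add(policy);
        cache.put(privilege, existing);                  // publish the new list under the key
    }
}
```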
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java
new file mode 100644
index 00000000000000..7094bbd710f75f
--- /dev/null
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java
@@ -0,0 +1,11 @@
+package com.linkedin.metadata.config;
+
+import lombok.Data;
+
+@Data
+public class SearchResultVisualConfig {
+ /**
+ * Whether to enable visual highlighting on search result names and descriptions.
+ */
+ public Boolean enableNameHighlight;
+}
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java
index d1c357186e1ae8..14ac2406c22566 100644
--- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java
@@ -22,4 +22,9 @@ public class VisualConfiguration {
* Queries tab related configurations
*/
public EntityProfileConfig entityProfile;
+
+ /**
+ * Search result related configurations
+ */
+ public SearchResultVisualConfig searchResult;
}
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java
index 1a56db1bd68b0d..b2b5260dc5e70a 100644
--- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java
@@ -11,4 +11,5 @@ public class SearchConfiguration {
private PartialConfiguration partial;
private CustomConfiguration custom;
private GraphQueryConfiguration graph;
+ private WordGramConfiguration wordGram;
}
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java
new file mode 100644
index 00000000000000..624d2a4c63c4c2
--- /dev/null
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java
@@ -0,0 +1,11 @@
+package com.linkedin.metadata.config.search;
+
+import lombok.Data;
+
+
+@Data
+public class WordGramConfiguration {
+ private float twoGramFactor;
+ private float threeGramFactor;
+ private float fourGramFactor;
+}
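For context on how the new holder above is consumed: the `SearchRequestHandlerTest` hunk earlier in this diff builds a `WordGramConfiguration`, sets the three factors, and attaches it to the shared `SearchConfiguration` via `setWordGram`. A minimal sketch of that wiring, reusing the same default factors this diff adds to `application.yml`:

```java
import com.linkedin.metadata.config.search.SearchConfiguration;
import com.linkedin.metadata.config.search.WordGramConfiguration;

public final class WordGramConfigSketch {

    /** Attaches word-gram boost factors to an existing search configuration. */
    public static SearchConfiguration withDefaultWordGram(SearchConfiguration config) {
        WordGramConfiguration wordGram = new WordGramConfiguration();
        // Defaults mirror ELASTICSEARCH_QUERY_{TWO,THREE,FOUR}_GRAM_FACTOR in application.yml.
        wordGram.setTwoGramFactor(1.2f);
        wordGram.setThreeGramFactor(1.5f);
        wordGram.setFourGramFactor(1.8f);
        config.setWordGram(wordGram);
        return config;
    }
}
```

The Lombok `@Data` annotation on both classes supplies the getters and setters used here, matching how the exact-match and partial configurations are already wired.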
diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml
index 9f7bf92039fdc1..d21442d0bf5c8b 100644
--- a/metadata-service/configuration/src/main/resources/application.yml
+++ b/metadata-service/configuration/src/main/resources/application.yml
@@ -111,6 +111,8 @@ visualConfig:
entityProfile:
# we only support default tab for domains right now. In order to implement for other entities, update React code
domainDefaultTab: ${DOMAIN_DEFAULT_TAB:} # set to DOCUMENTATION_TAB to show documentation tab first
+ searchResult:
+ enableNameHighlight: ${SEARCH_RESULT_NAME_HIGHLIGHT_ENABLED:true} # Enables visual highlighting on search result names/descriptions.
# Storage Layer
@@ -198,6 +200,10 @@ elasticsearch:
prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.6} # boost multiplier when exact prefix
caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.7} # stacked boost multiplier when case mismatch
enableStructured: ${ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED:true} # enable exact match on structured search
+ wordGram:
+ twoGramFactor: ${ELASTICSEARCH_QUERY_TWO_GRAM_FACTOR:1.2} # boost multiplier when match on 2-gram tokens
+ threeGramFactor: ${ELASTICSEARCH_QUERY_THREE_GRAM_FACTOR:1.5} # boost multiplier when match on 3-gram tokens
+ fourGramFactor: ${ELASTICSEARCH_QUERY_FOUR_GRAM_FACTOR:1.8} # boost multiplier when match on 4-gram tokens
# Field weight annotations are typically calibrated for exact match, if partial match is possible on the field use these adjustments
partial:
urnFactor: ${ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR:0.5} # multiplier on Urn token match, a partial match on Urn > non-Urn is assumed
@@ -318,4 +324,4 @@ cache:
search:
lineage:
ttlSeconds: ${CACHE_SEARCH_LINEAGE_TTL_SECONDS:86400} # 1 day
- lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300}
\ No newline at end of file
+ lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300}
diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json
index 7aeca546af3c90..e3beef5ac48719 100644
--- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json
+++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json
@@ -341,6 +341,7 @@
"doc" : "Title of the chart",
"Searchable" : {
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1279,6 +1280,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1405,6 +1407,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1464,6 +1467,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1865,6 +1869,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -2061,6 +2066,7 @@
"boostScore" : 10.0,
"enableAutocomplete" : true,
"fieldName" : "displayName",
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -2097,6 +2103,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -2161,6 +2168,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL",
"queryByDefault" : true
}
@@ -2340,6 +2348,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL",
"queryByDefault" : true
}
@@ -3217,6 +3226,7 @@
"Searchable" : {
"boostScore" : 8.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
} ],
@@ -3282,6 +3292,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -3867,6 +3878,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json
index 83ecaf41022c4f..0c9b49649bf1e9 100644
--- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json
+++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json
@@ -94,6 +94,7 @@
"doc" : "Title of the chart",
"Searchable" : {
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1326,6 +1327,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1471,6 +1473,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1530,6 +1533,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1922,6 +1926,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : false,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
},
"validate" : {
@@ -2111,6 +2116,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -2437,6 +2443,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL",
"queryByDefault" : true
}
@@ -2585,6 +2592,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL",
"queryByDefault" : true
}
@@ -3704,6 +3712,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -4302,6 +4311,7 @@
"Searchable" : {
"boostScore" : 8.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
} ],
@@ -4390,6 +4400,7 @@
"Searchable" : {
"boostScore" : 8.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
} ],
@@ -4484,6 +4495,7 @@
"Searchable" : {
"boostScore" : 8.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
} ],
@@ -4590,6 +4602,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -4696,6 +4709,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -4796,6 +4810,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -4879,6 +4894,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -5096,6 +5112,7 @@
"boostScore" : 10.0,
"enableAutocomplete" : true,
"fieldName" : "displayName",
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -5710,6 +5727,12 @@
"doc" : "Whether to skip aggregates/facets",
"default" : false,
"optional" : true
+ }, {
+ "name" : "getSuggestions",
+ "type" : "boolean",
+ "doc" : "Whether to request for search suggestions on the _entityName virtualized field",
+ "default" : false,
+ "optional" : true
} ]
}, {
"type" : "enum",
@@ -6081,6 +6104,31 @@
},
"doc" : "A list of search result metadata such as aggregations",
"default" : [ ]
+ }, {
+ "name" : "suggestions",
+ "type" : {
+ "type" : "array",
+ "items" : {
+ "type" : "record",
+ "name" : "SearchSuggestion",
+ "doc" : "The model for the search result",
+ "fields" : [ {
+ "name" : "text",
+ "type" : "string",
+ "doc" : "The suggestion text for this search query"
+ }, {
+ "name" : "score",
+ "type" : "float",
+ "doc" : "The score for how close this suggestion is to the original search query.\nThe closer to 1 means it is closer to the original query and 0 is further away."
+ }, {
+ "name" : "frequency",
+ "type" : "long",
+ "doc" : "How many matches there are with the suggested text for the given field"
+ } ]
+ }
+ },
+ "doc" : "A list of search query suggestions based on the given query",
+ "default" : [ ]
} ]
},
"doc" : "Metadata specific to the browse result of the queried path"
@@ -6187,7 +6235,7 @@
"type" : "int",
"doc" : "The total number of entities directly under searched path"
} ]
- }, "com.linkedin.metadata.search.SearchResultMetadata", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", {
+ }, "com.linkedin.metadata.search.SearchResultMetadata", "com.linkedin.metadata.search.SearchSuggestion", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", {
"type" : "record",
"name" : "SystemMetadata",
"namespace" : "com.linkedin.mxe",
diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json
index b1489df3db55e8..ffaefc8232e83d 100644
--- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json
+++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json
@@ -94,6 +94,7 @@
"doc" : "Title of the chart",
"Searchable" : {
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1032,6 +1033,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1158,6 +1160,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1217,6 +1220,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1618,6 +1622,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1806,6 +1811,7 @@
"boostScore" : 10.0,
"enableAutocomplete" : true,
"fieldName" : "displayName",
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1842,6 +1848,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1906,6 +1913,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL",
"queryByDefault" : true
}
@@ -2085,6 +2093,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL",
"queryByDefault" : true
}
@@ -2962,6 +2971,7 @@
"Searchable" : {
"boostScore" : 8.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
} ],
@@ -3027,6 +3037,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -3612,6 +3623,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json
index f4c2d16f84747f..e385c7c30b21a5 100644
--- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json
+++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json
@@ -94,6 +94,7 @@
"doc" : "Title of the chart",
"Searchable" : {
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1032,6 +1033,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1158,6 +1160,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1217,6 +1220,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1618,6 +1622,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1800,6 +1805,7 @@
"boostScore" : 10.0,
"enableAutocomplete" : true,
"fieldName" : "displayName",
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1836,6 +1842,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1900,6 +1907,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL",
"queryByDefault" : true
}
@@ -2079,6 +2087,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL",
"queryByDefault" : true
}
@@ -2956,6 +2965,7 @@
"Searchable" : {
"boostScore" : 8.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
} ],
@@ -3021,6 +3031,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -3606,6 +3617,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json
index 2676c2687bd722..b85c84be237950 100644
--- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json
+++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json
@@ -94,6 +94,7 @@
"doc" : "Title of the chart",
"Searchable" : {
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1326,6 +1327,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1471,6 +1473,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1530,6 +1533,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -1922,6 +1926,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : false,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
},
"validate" : {
@@ -2111,6 +2116,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -2431,6 +2437,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL",
"queryByDefault" : true
}
@@ -2579,6 +2586,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL",
"queryByDefault" : true
}
@@ -3698,6 +3706,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -4296,6 +4305,7 @@
"Searchable" : {
"boostScore" : 8.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
} ],
@@ -4384,6 +4394,7 @@
"Searchable" : {
"boostScore" : 8.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
} ],
@@ -4478,6 +4489,7 @@
"Searchable" : {
"boostScore" : 8.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
} ],
@@ -4584,6 +4596,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -4690,6 +4703,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -4790,6 +4804,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -4873,6 +4888,7 @@
"Searchable" : {
"boostScore" : 10.0,
"enableAutocomplete" : true,
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
@@ -5090,6 +5106,7 @@
"boostScore" : 10.0,
"enableAutocomplete" : true,
"fieldName" : "displayName",
+ "fieldNameAliases" : [ "_entityName" ],
"fieldType" : "TEXT_PARTIAL"
}
}, {
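Taken together, these snapshot changes register `_entityName` as a shared alias for each entity's primary name field. Assuming the alias is materialized as a field alias in the entity search indices (the index name, port, and client version below are assumptions, not something this diff guarantees), a quick way to poke at it from Python might look like:

```python
# Exploratory sketch: query an entity index through the _entityName alias.
# Assumes a local quickstart Elasticsearch on :9200 and the elasticsearch-py 8.x client;
# "datasetindex_v2" is a guess at one of DataHub's entity indices.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
resp = es.search(
    index="datasetindex_v2",
    query={"match": {"_entityName": "SampleCypressHiveDataset"}},
    size=5,
)
for hit in resp["hits"]["hits"]:
    print(hit["_id"], hit["_score"])
```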
diff --git a/metadata-service/war/src/main/resources/boot/policies.json b/metadata-service/war/src/main/resources/boot/policies.json
index 3fddf3456ecd7c..3cda0269b79f16 100644
--- a/metadata-service/war/src/main/resources/boot/policies.json
+++ b/metadata-service/war/src/main/resources/boot/policies.json
@@ -19,6 +19,7 @@
"GENERATE_PERSONAL_ACCESS_TOKENS",
"MANAGE_ACCESS_TOKENS",
"MANAGE_DOMAINS",
+ "MANAGE_GLOBAL_ANNOUNCEMENTS",
"MANAGE_TESTS",
"MANAGE_GLOSSARIES",
"MANAGE_USER_CREDENTIALS",
@@ -102,6 +103,7 @@
"VIEW_ANALYTICS",
"GENERATE_PERSONAL_ACCESS_TOKENS",
"MANAGE_DOMAINS",
+ "MANAGE_GLOBAL_ANNOUNCEMENTS",
"MANAGE_TESTS",
"MANAGE_GLOSSARIES",
"MANAGE_TAGS",
@@ -190,6 +192,7 @@
"GENERATE_PERSONAL_ACCESS_TOKENS",
"MANAGE_ACCESS_TOKENS",
"MANAGE_DOMAINS",
+ "MANAGE_GLOBAL_ANNOUNCEMENTS",
"MANAGE_TESTS",
"MANAGE_GLOSSARIES",
"MANAGE_USER_CREDENTIALS",
@@ -283,6 +286,7 @@
"privileges":[
"GENERATE_PERSONAL_ACCESS_TOKENS",
"MANAGE_DOMAINS",
+ "MANAGE_GLOBAL_ANNOUNCEMENTS",
"MANAGE_GLOSSARIES",
"MANAGE_TAGS"
],
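Because the same privilege string has to appear in several default policies, a small check script can confirm the bootstrap file stays consistent after edits like this. A sketch follows; it only assumes that privilege lists live under a `privileges` key somewhere in the JSON (as the hunks above suggest) and that it is run from the repository root.

```python
# Sanity-check which default policies in the bootstrap file grant a given privilege.
import json
from pathlib import Path

POLICIES = Path("metadata-service/war/src/main/resources/boot/policies.json")
PRIVILEGE = "MANAGE_GLOBAL_ANNOUNCEMENTS"

def iter_privilege_lists(node):
    """Yield every list found under a 'privileges' key, at any nesting depth."""
    if isinstance(node, dict):
        for key, value in node.items():
            if key == "privileges" and isinstance(value, list):
                yield value
            yield from iter_privilege_lists(value)
    elif isinstance(node, list):
        for item in node:
            yield from iter_privilege_lists(item)

doc = json.loads(POLICIES.read_text())
lists = list(iter_privilege_lists(doc))
with_priv = sum(1 for privs in lists if PRIVILEGE in privs)
print(f"{with_priv}/{len(lists)} privilege lists include {PRIVILEGE}")
```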
diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java
index c46d02a6eadf00..0b0d462f079bf8 100644
--- a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java
+++ b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java
@@ -64,6 +64,11 @@ public class PoliciesConfig {
"Manage Domains",
"Create and remove Asset Domains.");
+ public static final Privilege MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE = Privilege.of(
+ "MANAGE_GLOBAL_ANNOUNCEMENTS",
+ "Manage Home Page Posts",
+ "Create and delete home page posts");
+
public static final Privilege MANAGE_TESTS_PRIVILEGE = Privilege.of(
"MANAGE_TESTS",
"Manage Tests",
@@ -113,6 +118,7 @@ public class PoliciesConfig {
MANAGE_USERS_AND_GROUPS_PRIVILEGE,
VIEW_ANALYTICS_PRIVILEGE,
MANAGE_DOMAINS_PRIVILEGE,
+ MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE,
MANAGE_INGESTION_PRIVILEGE,
MANAGE_SECRETS_PRIVILEGE,
GENERATE_PERSONAL_ACCESS_TOKENS_PRIVILEGE,
@@ -192,8 +198,8 @@ public class PoliciesConfig {
public static final Privilege EDIT_ENTITY_PRIVILEGE = Privilege.of(
"EDIT_ENTITY",
- "Edit All",
- "The ability to edit any information about an entity. Super user privileges.");
+ "Edit Entity",
+ "The ability to edit any information about an entity. Super user privileges for the entity.");
public static final Privilege DELETE_ENTITY_PRIVILEGE = Privilege.of(
"DELETE_ENTITY",
diff --git a/perf-test/README.md b/perf-test/README.md
index 24fb064d3e28ac..191833361eae9e 100644
--- a/perf-test/README.md
+++ b/perf-test/README.md
@@ -58,7 +58,9 @@ locust -f perf-test/locustfiles/ingest.py
This will set up the web interface in http://localhost:8089 (unless the port is already taken). Once you click into it,
you should see the following
-![Locust Example](../docs/imgs/locust-example.png)
+
+
+
Input the number of users you would like to spawn and the spawn rate. Point the host to the deployed DataHub GMS (
locally, it should be http://localhost:8080). Click on the "Start swarming" button to start the load test.
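If it helps to picture what those web-UI inputs drive: a locustfile is plain Python, the user count controls how many instances of the user class locust spawns, and the spawn rate controls how quickly it starts them. The sketch below shows the general shape only; it is not the actual perf-test/locustfiles/ingest.py, and the /config request is a stand-in for the real ingest calls.

```python
# Minimal illustrative locustfile (not the real ingest.py).
from locust import HttpUser, task, between


class GmsUser(HttpUser):
    host = "http://localhost:8080"   # the deployed DataHub GMS; can also be set in the web UI
    wait_time = between(1, 3)        # pause 1-3 seconds between tasks per simulated user

    @task
    def hit_gms(self):
        # Stand-in request; the real locustfile sends ingest traffic to GMS.
        self.client.get("/config")
```

For scripted runs, the same parameters can be passed on the command line with locust's --headless, --users, and --spawn-rate flags instead of using the web UI.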
diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh
index 74ba6c9d24cc9f..08332c4fe76346 100755
--- a/smoke-test/run-quickstart.sh
+++ b/smoke-test/run-quickstart.sh
@@ -15,6 +15,5 @@ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props
echo "DATAHUB_VERSION = $DATAHUB_VERSION"
DATAHUB_TELEMETRY_ENABLED=false \
DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \
-datahub docker quickstart --version ${DATAHUB_VERSION} --kafka-setup --dump-logs-on-failure
-# --standalone_consumers
+datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup
diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js
index 1f40cdf602062d..e4e5a39ce1100d 100644
--- a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js
+++ b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js
@@ -4,68 +4,94 @@ const wrong_url = "https://www.linkedincom";
const correct_url = "https://www.linkedin.com";
describe("edit documentation and link to dataset", () => {
+ it("open test dataset page, edit documentation", () => {
+ //edit documentation and verify changes saved
+ cy.loginWithCredentials();
+ cy.visit(
+ "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"
+ );
+ cy.get("[role='tab']").contains("Documentation").click();
+ cy.waitTextVisible("my hive dataset");
+ cy.waitTextVisible("Sample doc");
+ cy.clickOptionWithText("Edit");
+ cy.focused().clear();
+ cy.focused().type(documentation_edited);
+ cy.get("button").contains("Save").click();
+ cy.waitTextVisible("Description Updated");
+ cy.waitTextVisible(documentation_edited);
+ //return documentation to original state
+ cy.clickOptionWithText("Edit");
+ cy.focused().clear().wait(1000);
+ cy.focused().type("my hive dataset");
+ cy.get("button").contains("Save").click();
+ cy.waitTextVisible("Description Updated");
+ cy.waitTextVisible("my hive dataset");
+ });
- it("open test dataset page, edit documentation", () => {
- //edit documentation and verify changes saved
- cy.loginWithCredentials();
- cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema");
- cy.get("[role='tab']").contains("Documentation").click();
- cy.waitTextVisible("my hive dataset");
- cy.waitTextVisible("Sample doc");
- cy.clickOptionWithText("Edit");
- cy.focused().clear();
- cy.focused().type(documentation_edited);
- cy.get("button").contains("Save").click();
- cy.waitTextVisible("Description Updated");
- cy.waitTextVisible(documentation_edited);
- //return documentation to original state
- cy.clickOptionWithText("Edit");
- cy.focused().clear().wait(1000);
- cy.focused().type("my hive dataset");
- cy.get("button").contains("Save").click();
- cy.waitTextVisible("Description Updated");
- cy.waitTextVisible("my hive dataset");
- });
+ it("open test dataset page, remove and add dataset link", () => {
+ cy.loginWithCredentials();
+ cy.visit(
+ "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"
+ );
+ cy.get("[role='tab']").contains("Documentation").click();
+ cy.contains("Sample doc").trigger("mouseover", { force: true });
+ cy.get('[data-icon="delete"]').click();
+ cy.waitTextVisible("Link Removed");
+ cy.get("button").contains("Add Link").click();
+ cy.get("#addLinkForm_url").type(wrong_url);
+ cy.waitTextVisible("This field must be a valid url.");
+ cy.focused().clear();
+ cy.waitTextVisible("A URL is required.");
+ cy.focused().type(correct_url);
+ cy.ensureTextNotPresent("This field must be a valid url.");
+ cy.get("#addLinkForm_label").type("Sample doc");
+ cy.get('[role="dialog"] button').contains("Add").click();
+ cy.waitTextVisible("Link Added");
+ cy.get("[role='tab']").contains("Documentation").click();
+ cy.get(`[href='${correct_url}']`).should("be.visible");
+ });
- it("open test dataset page, remove and add dataset link", () => {
- cy.loginWithCredentials();
- cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema");
- cy.get("[role='tab']").contains("Documentation").click();
- cy.contains("Sample doc").trigger("mouseover", { force: true });
- cy.get('[data-icon="delete"]').click();
- cy.waitTextVisible("Link Removed");
- cy.get("button").contains("Add Link").click();
- cy.get("#addLinkForm_url").type(wrong_url);
- cy.waitTextVisible("This field must be a valid url.");
- cy.focused().clear();
- cy.waitTextVisible("A URL is required.");
- cy.focused().type(correct_url);
- cy.ensureTextNotPresent("This field must be a valid url.");
- cy.get("#addLinkForm_label").type("Sample doc");
- cy.get('[role="dialog"] button').contains("Add").click();
- cy.waitTextVisible("Link Added");
- cy.get("[role='tab']").contains("Documentation").click();
- cy.get(`[href='${correct_url}']`).should("be.visible");
- });
+ it("open test domain page, remove and add dataset link", () => {
+ cy.loginWithCredentials();
+ cy.visit("/domain/urn:li:domain:marketing/Entities");
+ cy.get("[role='tab']").contains("Documentation").click();
+ cy.get("button").contains("Add Link").click();
+ cy.get("#addLinkForm_url").type(wrong_url);
+ cy.waitTextVisible("This field must be a valid url.");
+ cy.focused().clear();
+ cy.waitTextVisible("A URL is required.");
+ cy.focused().type(correct_url);
+ cy.ensureTextNotPresent("This field must be a valid url.");
+ cy.get("#addLinkForm_label").type("Sample doc");
+ cy.get('[role="dialog"] button').contains("Add").click();
+ cy.waitTextVisible("Link Added");
+ cy.get("[role='tab']").contains("Documentation").click();
+ cy.get(`[href='${correct_url}']`).should("be.visible");
+ cy.contains("Sample doc").trigger("mouseover", { force: true });
+ cy.get('[data-icon="delete"]').click();
+ cy.waitTextVisible("Link Removed");
+ });
- it("edit field documentation", () => {
- cy.loginWithCredentials();
- cy.visit("/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema");
- cy.get("tbody [data-icon='edit']").first().click({ force: true });
- cy.waitTextVisible("Update description");
- cy.waitTextVisible("Foo field description has changed");
- cy.focused().clear().wait(1000);
- cy.focused().type(documentation_edited);
- cy.get("button").contains("Update").click();
- cy.waitTextVisible("Updated!");
- cy.waitTextVisible(documentation_edited);
- cy.waitTextVisible("(edited)");
- cy.get("tbody [data-icon='edit']").first().click({ force: true });
- cy.focused().clear().wait(1000);
- cy.focused().type("Foo field description has changed");
- cy.get("button").contains("Update").click();
- cy.waitTextVisible("Updated!");
- cy.waitTextVisible("Foo field description has changed");
- cy.waitTextVisible("(edited)");
- });
-});
\ No newline at end of file
+ it("edit field documentation", () => {
+ cy.loginWithCredentials();
+ cy.visit(
+ "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema"
+ );
+ cy.get("tbody [data-icon='edit']").first().click({ force: true });
+ cy.waitTextVisible("Update description");
+ cy.waitTextVisible("Foo field description has changed");
+ cy.focused().clear().wait(1000);
+ cy.focused().type(documentation_edited);
+ cy.get("button").contains("Update").click();
+ cy.waitTextVisible("Updated!");
+ cy.waitTextVisible(documentation_edited);
+ cy.waitTextVisible("(edited)");
+ cy.get("tbody [data-icon='edit']").first().click({ force: true });
+ cy.focused().clear().wait(1000);
+ cy.focused().type("Foo field description has changed");
+ cy.get("button").contains("Update").click();
+ cy.waitTextVisible("Updated!");
+ cy.waitTextVisible("Foo field description has changed");
+ cy.waitTextVisible("(edited)");
+ });
+});
diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js
index 7686acfe50de0d..9559435ff01c85 100644
--- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js
+++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js
@@ -64,6 +64,7 @@ describe("create and manage group", () => {
});
it("update group info", () => {
+ var expected_name = Cypress.env('ADMIN_USERNAME');
cy.loginWithCredentials();
cy.visit("/settings/identities/groups");
cy.clickOptionWithText(group_name);
@@ -77,13 +78,13 @@ describe("create and manage group", () => {
cy.contains("Test group description EDITED").should("be.visible");
cy.clickOptionWithText("Add Owners");
cy.contains("Search for users or groups...").click({ force: true });
- cy.focused().type(Cypress.env('ADMIN_USERNAME'));
- cy.get(".ant-select-item-option").contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).click();
+ cy.focused().type(expected_name);
+ cy.get(".ant-select-item-option").contains(expected_name, { matchCase: false }).click();
cy.focused().blur();
- cy.contains(Cypress.env('ADMIN_USERNAME')).should("have.length", 1);
+ cy.contains(expected_name).should("have.length", 1);
cy.get('[role="dialog"] button').contains("Done").click();
cy.waitTextVisible("Owners Added");
- cy.contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).should("be.visible");
+ cy.contains(expected_name, { matchCase: false }).should("be.visible");
cy.clickOptionWithText("Edit Group");
cy.waitTextVisible("Edit Profile");
cy.get("#email").type(`${test_id}@testemail.com`);
diff --git a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl
index ed30244c31b175..6dff14133ee60a 100644
--- a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl
+++ b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl
@@ -14,7 +14,8 @@ record TestEntityInfo includes CustomProperties {
@Searchable = {
"fieldName": "textFieldOverride",
"fieldType": "TEXT",
- "addToFilters": true
+ "addToFilters": true,
+ "fieldNameAliases": [ "_entityName" ]
}
textField: optional string
@@ -25,6 +26,11 @@ record TestEntityInfo includes CustomProperties {
}
textArrayField: optional array[string]
+ @Searchable = {
+ "fieldType": "WORD_GRAM"
+ }
+ wordGramField: optional string
+
@Relationship = {
"name": "foreignKey",
"entityTypes": []