feat(ingest): allow lower freq profiling based on date of month/day of week #8489

Merged · 5 commits · Aug 4, 2023
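This PR threads a new `operation_config` block into each source's profiling settings and gates profiling behind a shared `is_profiling_enabled()` check, so that profiling can be limited to a particular day of the week or date of the month. The `operation_config` module itself is not part of the diff excerpt below, so the sketch that follows only illustrates how such a gate could work; the field names (`lower_freq_profile_enabled`, `profile_day_of_week`, `profile_date_of_month`) and the exact rules are assumptions rather than code from this PR.

```python
# Illustrative sketch only: the real helper lives in
# datahub/ingestion/source_config/operation_config.py (not shown in this excerpt);
# the field names and gating rules here are assumptions.
import datetime
from typing import Optional

from pydantic import BaseModel, Field


class OperationConfigSketch(BaseModel):
    lower_freq_profile_enabled: bool = Field(default=False)
    profile_day_of_week: Optional[int] = Field(default=None)  # assumed: 0 = Monday ... 6 = Sunday
    profile_date_of_month: Optional[int] = Field(default=None)  # assumed: 1 - 31


def is_profiling_enabled_sketch(config: OperationConfigSketch) -> bool:
    """Return True unless lower-frequency profiling defers the run to another day."""
    if not config.lower_freq_profile_enabled:
        return True
    today = datetime.date.today()
    if config.profile_date_of_month is not None:
        return today.day == config.profile_date_of_month
    if config.profile_day_of_week is not None:
        return today.weekday() == config.profile_day_of_week
    return False


# Example: profile only on the first day of each month.
print(is_profiling_enabled_sketch(
    OperationConfigSketch(lower_freq_profile_enabled=True, profile_date_of_month=1)
))
```

In the diff, each source imports the real helper with `from datahub.ingestion.source_config.operation_config import is_profiling_enabled` and combines it with its existing `profiling.enabled` flag.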
10 changes: 9 additions & 1 deletion metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
@@ -67,6 +67,7 @@
StatefulIngestionConfigBase,
StatefulIngestionSourceBase,
)
from datahub.ingestion.source_config.operation_config import is_profiling_enabled
from datahub.metadata.com.linkedin.pegasus2avro.common import Status, SubTypes
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
@@ -161,6 +162,11 @@ class GlueSourceConfig(
default=None, description=""
)

def is_profiling_enabled(self) -> bool:
return self.profiling is not None and is_profiling_enabled(
self.profiling.operation_config
)

@property
def glue_client(self):
return self.get_glue_client()
@@ -813,7 +819,9 @@ def _create_profile_mcp(
def get_profile_if_enabled(
self, mce: MetadataChangeEventClass, database_name: str, table_name: str
) -> Iterable[MetadataWorkUnit]:
if self.source_config.profiling:
# We don't need both checks, only the second one,
# but then lint believes that GlueProfilingConfig can be None
if self.source_config.profiling and self.source_config.is_profiling_enabled():
# for cross-account ingestion
kwargs = dict(
DatabaseName=database_name,
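A note on the comment in the Glue hunk above: `GlueSourceConfig.profiling` is optional (it defaults to `None`), and calling `is_profiling_enabled()` does not narrow that `Optional` for the type checker, which is why the seemingly redundant truthiness check is kept at the call site. A minimal, self-contained illustration follows; the class names are invented for the example and are not DataHub code.

```python
# Minimal sketch of the Optional-narrowing issue; names are invented for illustration.
from typing import Optional


class ProfilingConfig:
    enabled: bool = True


class SourceConfig:
    # Optional, mirroring GlueSourceConfig.profiling defaulting to None.
    profiling: Optional[ProfilingConfig] = None

    def is_profiling_enabled(self) -> bool:
        # The `is not None` check narrows the Optional inside this method only.
        return self.profiling is not None and self.profiling.enabled


def run(config: SourceConfig) -> None:
    # A bare `if config.is_profiling_enabled():` does not tell the type checker
    # that `config.profiling` is non-None, so later attribute access is flagged.
    # Keeping the extra truthiness check narrows the type at the call site.
    if config.profiling and config.is_profiling_enabled():
        print(config.profiling.enabled)


run(SourceConfig())
```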
@@ -310,7 +310,7 @@ def metadata_read_capability_test(
project_id=project_id,
dataset_name=result[0].name,
tables={},
with_data_read_permission=config.profiling.enabled,
with_data_read_permission=config.is_profiling_enabled(),
)
if len(list(tables)) == 0:
return CapabilityReport(
@@ -612,7 +612,7 @@ def _process_project(
)
except Exception as e:
error_message = f"Unable to get datasets for project {project_id}, skipping. The error was: {e}"
if self.config.profiling.enabled:
if self.config.is_profiling_enabled():
error_message = f"Unable to get datasets for project {project_id}, skipping. Does your service account has bigquery.datasets.get permission? The error was: {e}"
logger.error(error_message)
self.report.report_failure(
@@ -647,7 +647,7 @@ def _process_project(

except Exception as e:
error_message = f"Unable to get tables for dataset {bigquery_dataset.name} in project {project_id}, skipping. Does your service account has bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission? The error was: {e}"
if self.config.profiling.enabled:
if self.config.is_profiling_enabled():
error_message = f"Unable to get tables for dataset {bigquery_dataset.name} in project {project_id}, skipping. Does your service account has bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission, bigquery.tables.getData permission? The error was: {e}"

trace = traceback.format_exc()
@@ -659,7 +659,7 @@
)
continue

if self.config.profiling.enabled:
if self.config.is_profiling_enabled():
logger.info(f"Starting profiling project {project_id}")
self.report.set_ingestion_stage(project_id, "Profiling")
yield from self.profiler.get_workunits(
@@ -793,7 +793,7 @@ def _process_schema(
if self.config.include_views:
db_views[dataset_name] = list(
BigQueryDataDictionary.get_views_for_dataset(
conn, project_id, dataset_name, self.config.profiling.enabled
conn, project_id, dataset_name, self.config.is_profiling_enabled()
)
)

@@ -841,7 +841,7 @@ def _process_table(

# We only collect profile ignore list if profiling is enabled and profile_table_level_only is false
if (
self.config.profiling.enabled
self.config.is_profiling_enabled()
and not self.config.profiling.profile_table_level_only
):
table.columns_ignore_from_profiling = self.generate_profile_ignore_list(
@@ -1218,7 +1218,7 @@ def get_tables_for_dataset(
# https://cloud.google.com/bigquery/docs/information-schema-partitions
max_batch_size: int = (
self.config.number_of_datasets_process_in_batch
if not self.config.profiling.enabled
if not self.config.is_profiling_enabled()
else self.config.number_of_datasets_process_in_batch_if_profiling_enabled
)

@@ -1235,7 +1235,7 @@
project_id,
dataset_name,
items_to_get,
with_data_read_permission=self.config.profiling.enabled,
with_data_read_permission=self.config.is_profiling_enabled(),
)
items_to_get.clear()

@@ -1245,7 +1245,7 @@
project_id,
dataset_name,
items_to_get,
with_data_read_permission=self.config.profiling.enabled,
with_data_read_permission=self.config.is_profiling_enabled(),
)

self.report.metadata_extraction_sec[f"{project_id}.{dataset_name}"] = round(
@@ -35,6 +35,10 @@
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
from datahub.ingestion.source_config.operation_config import (
OperationConfig,
is_profiling_enabled,
)
from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
SchemaField,
@@ -199,6 +203,10 @@ class ElasticProfiling(ConfigModel):
default=False,
description="Whether to enable profiling for the elastic search source.",
)
operation_config: OperationConfig = Field(
default_factory=OperationConfig,
description="Experimental feature. To specify operation configs.",
)


class CollapseUrns(ConfigModel):
@@ -296,6 +304,11 @@ class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
default_factory=CollapseUrns,
)

def is_profiling_enabled(self) -> bool:
return self.profiling.enabled and is_profiling_enabled(
self.profiling.operation_config
)

@validator("host")
def host_colon_port_comma(cls, host_val: str) -> str:
for entry in host_val.split(","):
@@ -511,7 +524,7 @@ def _extract_mcps(
),
)

if self.source_config.profiling.enabled:
if self.source_config.is_profiling_enabled():
if self.cat_response is None:
self.cat_response = self.client.cat.indices(
params={
@@ -7,6 +7,7 @@
from pydantic.fields import Field

from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.ingestion.source_config.operation_config import OperationConfig

_PROFILING_FLAGS_TO_REPORT = {
"turn_off_expensive_profiling_metrics",
@@ -22,6 +23,10 @@ class GEProfilingConfig(ConfigModel):
enabled: bool = Field(
default=False, description="Whether profiling should be done."
)
operation_config: OperationConfig = Field(
default_factory=OperationConfig,
description="Experimental feature. To specify operation configs.",
)
limit: Optional[int] = Field(
default=None,
description="Max number of documents to profile. By default, profiles all documents.",
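For the GE-based SQL sources, the new `operation_config` field sits next to `enabled` under `profiling`. The snippet below is a hedged example of opting in from a recipe-style dictionary: the import path matches the diff above, but the nested field names are assumptions, since the `operation_config` module is not shown in this excerpt.

```python
# Sketch: enabling lower-frequency profiling through the new operation_config.
# The import path comes from the diff; the field names passed to parse_obj are
# assumed and may differ in the actual module.
from datahub.ingestion.source_config.operation_config import (
    OperationConfig,
    is_profiling_enabled,
)

operation_config = OperationConfig.parse_obj(
    {
        "lower_freq_profile_enabled": True,  # assumed field name
        "profile_day_of_week": 4,  # assumed: 0 = Monday, so 4 = Friday
    }
)

# Sources combine their own profiling.enabled flag with this check, e.g.
# `self.profiling.enabled and is_profiling_enabled(self.profiling.operation_config)`.
if is_profiling_enabled(operation_config):
    print("Profiling would run on today's schedule")
```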
@@ -3,6 +3,7 @@
from pydantic.fields import Field

from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.ingestion.source_config.operation_config import OperationConfig


class GlueProfilingConfig(ConfigModel):
@@ -54,3 +55,8 @@ class GlueProfilingConfig(ConfigModel):
default=AllowDenyPattern.allow_all(),
description="""Regex patterns for filtering partitions for profile. The pattern should be a string like: "{'key':'value'}".""",
)

operation_config: OperationConfig = Field(
default_factory=OperationConfig,
description="Experimental feature. To specify operation configs.",
)
@@ -199,7 +199,7 @@ def _create_iceberg_workunit(
if dpi_aspect:
yield dpi_aspect

if self.config.profiling.enabled:
if self.config.is_profiling_enabled():
profiler = IcebergProfiler(self.report, self.config.profiling)
yield from profiler.profile_table(dataset_name, dataset_urn, table)

@@ -22,6 +22,10 @@
from datahub.ingestion.source.state.stateful_ingestion_base import (
StatefulIngestionConfigBase,
)
from datahub.ingestion.source_config.operation_config import (
OperationConfig,
is_profiling_enabled,
)


class IcebergProfilingConfig(ConfigModel):
@@ -41,6 +45,10 @@
default=True,
description="Whether to profile for the max value of numeric columns.",
)
operation_config: OperationConfig = Field(
default_factory=OperationConfig,
description="Experimental feature. To specify operation configs.",
)
# Stats we cannot compute without looking at data
# include_field_mean_value: bool = True
# include_field_median_value: bool = True
@@ -82,6 +90,11 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
)
profiling: IcebergProfilingConfig = IcebergProfilingConfig()

def is_profiling_enabled(self) -> bool:
return self.profiling.enabled and is_profiling_enabled(
self.profiling.operation_config
)

@root_validator()
def _ensure_one_filesystem_is_configured(
cls: "IcebergSourceConfig", values: Dict
@@ -51,7 +51,7 @@ def get_workunits(
) -> Iterable[MetadataWorkUnit]:
# Extra default SQLAlchemy option for better connection pooling and threading.
# https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
if self.config.profiling.enabled:
if self.config.is_profiling_enabled():
self.config.options.setdefault(
"max_overflow", self.config.profiling.max_workers
)
@@ -408,7 +408,7 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit
connection=connection, all_tables=all_tables, database=database
)

if self.config.profiling.enabled:
if self.config.is_profiling_enabled():
profiler = RedshiftProfiler(
config=self.config,
report=self.report,
6 changes: 6 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/s3/config.py
@@ -18,6 +18,7 @@
from datahub.ingestion.source.state.stateful_ingestion_base import (
StatefulIngestionConfigBase,
)
from datahub.ingestion.source_config.operation_config import is_profiling_enabled

# hide annoying debug errors from py4j
logging.getLogger("py4j").setLevel(logging.ERROR)
@@ -84,6 +85,11 @@ class DataLakeSourceConfig(
"path_spec", "path_specs", lambda path_spec: [path_spec]
)

def is_profiling_enabled(self) -> bool:
return self.profiling.enabled and is_profiling_enabled(
self.profiling.operation_config
)

@pydantic.validator("path_specs", always=True)
def check_path_specs_and_infer_platform(
cls, path_specs: List[PathSpec], values: Dict
@@ -5,12 +5,17 @@

from datahub.configuration import ConfigModel
from datahub.configuration.common import AllowDenyPattern
from datahub.ingestion.source_config.operation_config import OperationConfig


class DataLakeProfilerConfig(ConfigModel):
enabled: bool = Field(
default=False, description="Whether profiling should be done."
)
operation_config: OperationConfig = Field(
default_factory=OperationConfig,
description="Experimental feature. To specify operation configs.",
)

# These settings will override the ones below.
profile_table_level_only: bool = Field(
11 changes: 7 additions & 4 deletions metadata-ingestion/src/datahub/ingestion/source/s3/source.py
@@ -264,14 +264,17 @@ def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
config_option: config.dict().get(config_option)
for config_option in config_options_to_report
}
config_report = {**config_report, "profiling_enabled": config.profiling.enabled}
config_report = {
**config_report,
"profiling_enabled": config.is_profiling_enabled(),
}

telemetry.telemetry_instance.ping(
"data_lake_config",
config_report,
)

if config.profiling.enabled:
if config.is_profiling_enabled():
telemetry.telemetry_instance.ping(
"data_lake_profiling_config",
{
@@ -659,7 +662,7 @@ def ingest_table(
table_data.table_path, dataset_urn
)

if self.source_config.profiling.enabled:
if self.source_config.is_profiling_enabled():
yield from self.get_table_profile(table_data, dataset_urn)

def get_prefix(self, relative_path: str) -> str:
@@ -884,7 +887,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
for guid, table_data in table_dict.items():
yield from self.ingest_table(table_data, path_spec)

if not self.source_config.profiling.enabled:
if not self.source_config.is_profiling_enabled():
return

total_time_taken = timer.elapsed_seconds()
15 changes: 14 additions & 1 deletion metadata-ingestion/src/datahub/ingestion/source/salesforce.py
@@ -30,6 +30,10 @@
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
from datahub.ingestion.source_config.operation_config import (
OperationConfig,
is_profiling_enabled,
)
from datahub.metadata.schema_classes import (
AuditStampClass,
BooleanTypeClass,
@@ -70,6 +74,10 @@ class SalesforceProfilingConfig(ConfigModel):
default=False,
description="Whether profiling should be done. Supports only table-level profiling at this stage",
)
operation_config: OperationConfig = Field(
default_factory=OperationConfig,
description="Experimental feature. To specify operation configs.",
)

# TODO - support field level profiling

@@ -124,6 +132,11 @@ class SalesforceConfig(DatasetSourceConfigMixin):
description="Regex patterns for profiles to filter in ingestion, allowed by the `object_pattern`.",
)

def is_profiling_enabled(self) -> bool:
return self.profiling.enabled and is_profiling_enabled(
self.profiling.operation_config
)

@validator("instance_url")
def remove_trailing_slash(cls, v):
return config_clean.remove_trailing_slashes(v)
@@ -329,7 +342,7 @@ def get_salesforce_object_workunits(
if self.config.domain is not None:
yield from self.get_domain_workunit(sObjectName, datasetUrn)

if self.config.profiling.enabled and self.config.profile_pattern.allowed(
if self.config.is_profiling_enabled() and self.config.profile_pattern.allowed(
sObjectName
):
yield from self.get_profile_workunit(sObjectName, datasetUrn)
@@ -58,7 +58,7 @@ def get_workunits(
) -> Iterable[MetadataWorkUnit]:
# Extra default SQLAlchemy option for better connection pooling and threading.
# https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
if self.config.profiling.enabled:
if self.config.is_profiling_enabled():
self.config.options.setdefault(
"max_overflow", self.config.profiling.max_workers
)
@@ -272,7 +272,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config):
run_id=self.ctx.run_id,
)

if config.profiling.enabled:
if config.is_profiling_enabled():
# For profiling
self.profiler = SnowflakeProfiler(
config, self.report, self.profiling_state_handler
@@ -701,7 +701,7 @@ def _process_database(
for snowflake_schema in snowflake_db.schemas:
yield from self._process_schema(snowflake_schema, db_name)

if self.config.profiling.enabled and self.db_tables:
if self.config.is_profiling_enabled() and self.db_tables:
yield from self.profiler.get_workunits(snowflake_db, self.db_tables)

def fetch_schemas_for_database(self, snowflake_db, db_name):