Skip to content

Commit

Permalink
fix(ingest): cache sql is_profiling_enabled method (#11665)
Browse files Browse the repository at this point in the history
Co-authored-by: Mayuri Nehate <[email protected]>
  • Loading branch information
hsheth2 and mayurinehate authored Oct 22, 2024
1 parent 7f6f18c commit 7ba77a9
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 2 deletions.
9 changes: 7 additions & 2 deletions metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@
"more_itertools",
}

# Single definition of the cachetools requirement; it is unpacked or unioned
# into each dependency set below that needs it instead of repeating the string.
cachetools_lib = {
"cachetools",
}

sql_common = (
{
# Required for all SQL sources.
Expand All @@ -138,6 +142,7 @@
# https://github.com/ipython/traitlets/issues/741
"traitlets<5.2.2",
"greenlet",
*cachetools_lib,
}
| usage_common
| sqlglot_lib
Expand Down Expand Up @@ -213,7 +218,7 @@
"pandas",
"cryptography",
"msal",
"cachetools",
*cachetools_lib,
} | classification_lib

trino = {
Expand Down Expand Up @@ -457,7 +462,7 @@
| sqlglot_lib
| classification_lib
| {"db-dtypes"} # Pandas extension data types
| {"cachetools"},
| cachetools_lib,
"s3": {*s3_base, *data_lake_profiling},
"gcs": {*s3_base, *data_lake_profiling},
"abs": {*abs_base, *data_lake_profiling},
Expand Down
10 changes: 10 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from abc import abstractmethod
from typing import Any, Dict, Optional

import cachetools
import cachetools.keys
import pydantic
from pydantic import Field
from sqlalchemy.engine import URL
Expand All @@ -27,6 +29,7 @@
StatefulIngestionConfigBase,
)
from datahub.ingestion.source_config.operation_config import is_profiling_enabled
from datahub.utilities.cachetools_keys import self_methodkey

logger: logging.Logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -115,6 +118,13 @@ class SQLCommonConfig(
# Custom Stateful Ingestion settings
stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None

# TRICKY: The operation_config is time-dependent. Because we don't want to change
# whether or not we're running profiling mid-ingestion, we cache the result of this method.
# TODO: This decorator should be moved to the is_profiling_enabled(operation_config) method.
@cachetools.cached(
cache=cachetools.LRUCache(maxsize=1),
key=self_methodkey,
)
def is_profiling_enabled(self) -> bool:
return self.profiling.enabled and is_profiling_enabled(
self.profiling.operation_config
Expand Down
8 changes: 8 additions & 0 deletions metadata-ingestion/src/datahub/utilities/cachetools_keys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from typing import Any

import cachetools.keys


def self_methodkey(self: Any, *args: Any, **kwargs: Any) -> Any:
    """Build a cachetools cache key for a method call, keyed on ``id(self)``.

    The instance itself is never placed in the key; only its ``id()`` is,
    followed by the positional and keyword arguments of the call.

    NOTE(review): presumably this exists so that instances which are not
    hashable can still be used with ``cachetools.cached`` -- confirm against
    callers. Keep in mind that ``id()`` values are only guaranteed unique
    among objects that are alive at the same time.
    """
    instance_id = id(self)
    return cachetools.keys.hashkey(instance_id, *args, **kwargs)

0 comments on commit 7ba77a9

Please sign in to comment.