Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

load analyzer configuration from file #1337

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
8a45ec6
initial version of loader
Mar 20, 2024
b41428c
Merge branch 'main' into feature/api_additions
omri374 Mar 20, 2024
847806b
addressed comments
Mar 20, 2024
07d093e
linting fixes
Mar 20, 2024
5db852f
Merge branch 'feature/api_additions' of github.com:RoeyBC/presidio in…
Mar 20, 2024
519cc05
re structured recognizers in yaml
Mar 29, 2024
1b326cc
Merge branch 'main' into feature/api_additions
omri374 Mar 30, 2024
6c28004
addressed comments and fixed predefined recognizers loading
Mar 31, 2024
37a0760
added engine provider to analyzer init
Mar 31, 2024
cdb7eb4
moved logic to recognizer registry provider
Mar 31, 2024
f4eba3e
some name fixes to recognizer provider
Mar 31, 2024
b4e9c85
added language support to recognizer registry
Apr 4, 2024
288ce70
fixed interface issues, added unit tests for providers
Apr 5, 2024
e4ab92d
Merge branch 'feature/api_additions' of github.com:RoeyBC/presidio in…
Apr 5, 2024
bd7ff75
fixed tests, addressed comments
Apr 10, 2024
3ee013b
Merge branch 'main' into feature/api_additions
omri374 Apr 11, 2024
da5a8d1
added yaml configuration to package, fixed linting rules
Apr 12, 2024
efce9d8
Merge branch 'feature/api_additions' of github.com:RoeyBC/presidio in…
Apr 12, 2024
aac0f78
move all conf file to a single location
Apr 15, 2024
bb03246
remove file from previous location
Apr 15, 2024
4212a77
Merge branch 'main' into feature/api_additions
SharonHart Apr 16, 2024
21e6ebc
Merge branch 'main' into feature/api_additions
omri374 Apr 18, 2024
b264637
merged from main, added default conf file for engine provider
Apr 18, 2024
3455e56
addressed some comments
Apr 18, 2024
01268ee
setup fixups
Apr 18, 2024
c755784
remove redundant line
Apr 18, 2024
1a6e909
fix long line
Apr 18, 2024
6cc6261
fixing linting errors
Apr 18, 2024
fa78986
Update presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py
omri374 Apr 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion presidio-analyzer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from presidio_analyzer.analyzer_engine import AnalyzerEngine
from presidio_analyzer.analyzer_request import AnalyzerRequest
from presidio_analyzer.analyzer_engine_provider import AnalyzerEngineProvider
roeybc marked this conversation as resolved.
Show resolved Hide resolved

DEFAULT_PORT = "3000"

Expand All @@ -37,7 +38,7 @@ def __init__(self):
self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
self.app = Flask(__name__)
self.logger.info("Starting analyzer engine")
self.engine = AnalyzerEngine()
omri374 marked this conversation as resolved.
Show resolved Hide resolved
self.engine: AnalyzerEngine = AnalyzerEngineProvider(conf_file=os.path.abspath("conf/analyzer.yaml")).create_engine()
self.logger.info(WELCOME_MESSAGE)

@self.app.route("/health")
Expand Down
52 changes: 52 additions & 0 deletions presidio-analyzer/conf/analyzer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Analyzer configuration read by AnalyzerEngineProvider.
# Top-level keys consumed by the loader: recognizer_registry,
# supported_languages, default_score_threshold, nlp_configuration.

recognizer_registry:
  # Each entry is either a mapping (name + supported_language) or a bare
  # recognizer name; the loader uses "en" as the language for bare names.
  recognizers:
    -
      name: CreditCardRecognizer
      supported_language: en

    -
      name: CreditCardRecognizer
      supported_language: es

    - ItFiscalCodeRecognizer

supported_languages:
  - en
  - es

default_score_threshold: 0.7

# Passed as-is to NlpEngineProvider(nlp_configuration=...).
# NOTE(review): nesting below was reconstructed after indentation was lost;
# confirm against the NLP engine configuration schema.
nlp_configuration:
  nlp_engine_name: transformers
  models:
    -
      lang_code: en
      model_name:
        spacy: en_core_web_sm
        transformers: StanfordAIMI/stanford-deidentifier-base

  ner_model_configuration:
    labels_to_ignore:
      - O
    aggregation_strategy: simple # "simple", "first", "average", "max"
    stride: 16
    alignment_mode: strict # "strict", "contract", "expand"
    model_to_presidio_entity_mapping:
      PER: PERSON
      LOC: LOCATION
      ORG: ORGANIZATION
      AGE: AGE
      ID: ID
      EMAIL: EMAIL
      PATIENT: PERSON
      STAFF: PERSON
      HOSP: ORGANIZATION
      PATORG: ORGANIZATION
      DATE: DATE_TIME
      PHONE: PHONE_NUMBER
      HCW: PERSON
      HOSPITAL: ORGANIZATION

    low_confidence_score_multiplier: 0.4
    low_score_entity_names:
      - ID
93 changes: 93 additions & 0 deletions presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import yaml
roeybc marked this conversation as resolved.
Show resolved Hide resolved
import os
import logging
from pathlib import Path
from typing import Optional, Union

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpEngine
import presidio_analyzer.recognizer_registry

logger = logging.getLogger("presidio-analyzer")


class AnalyzerEngineProvider:
    """
    Utility class for loading Presidio Analyzer.

    Use this class to load presidio analyzer engine from a yaml file.
    Sections that are missing from the file are passed to the
    AnalyzerEngine constructor as None (or, for the score threshold, 0).

    :param conf_file: path to the yaml configuration file. When it is None
        or does not point to an existing path, an empty configuration is used.
    """

    def __init__(self, conf_file: Optional[Union[Path, str]] = None):
        self.configuration = {}
        # conf_file defaults to None, and os.path.exists(None) raises
        # TypeError, so the None case has to be checked explicitly.
        if conf_file is None or not os.path.exists(conf_file):
            logger.warning(
                "configuration file is missing. "
                "Using default configuration for analyzer engine"
            )
            return

        # Context manager so the file handle is closed after parsing.
        with open(conf_file) as conf_stream:
            loaded = yaml.safe_load(conf_stream)

        # safe_load returns None for an empty document and may return a
        # non-mapping (list, scalar); the rest of the class needs a dict.
        if isinstance(loaded, dict):
            self.configuration = loaded
        elif loaded is not None:
            logger.warning(
                "configuration file does not contain a mapping. "
                "Using default configuration for analyzer engine"
            )

    def create_engine(self) -> AnalyzerEngine:
        """
        Load Presidio Analyzer from yaml configuration file.

        :return: analyzer engine initialized with yaml configuration
        """
        nlp_engine = self._load_nlp_engine()
        registry = self._load_recognizer_registry()
        supported_languages = self.configuration.get("supported_languages", None)
        default_score_threshold = self.configuration.get("default_score_threshold", 0)

        return AnalyzerEngine(
            nlp_engine=nlp_engine,
            registry=registry,
            supported_languages=supported_languages,
            default_score_threshold=default_score_threshold,
        )

    def _get_name(self, recognizer: Union[str, dict]) -> str:
        """
        Return the recognizer class name of a configuration entry.

        :param recognizer: either the bare class name, or a mapping
            holding it under the "name" key
        :return: the recognizer class name
        """
        if isinstance(recognizer, str):
            return recognizer
        return recognizer["name"]

    def _get_language(self, recognizer: Union[str, dict]) -> str:
        """
        Return the language of a recognizer configuration entry.

        :param recognizer: either the bare class name, or a mapping that may
            hold the language under the "supported_language" key
        :return: the configured language, or "en" when none is given
        """
        if isinstance(recognizer, str):
            return "en"
        # Same default as for the bare-name form instead of a KeyError.
        return recognizer.get("supported_language", "en")

    def _load_recognizer_registry(self) -> Optional[RecognizerRegistry]:
        """
        Build the recognizer registry described by the configuration.

        :return: a registry holding one instance per configured recognizer,
            or None when the configuration has no usable
            "recognizer_registry" section
        :raises ValueError: if a configured recognizer name is not an
            attribute of presidio_analyzer.predefined_recognizers
        """
        if "recognizer_registry" not in self.configuration:
            logger.warning(
                "configuration file is missing 'recognizer_registry'. "
                "Using default configuration for recognizer registry"
            )
            return None

        recognizer_registry = self.configuration["recognizer_registry"]
        if isinstance(recognizer_registry, str):
            if recognizer_registry != "predefined":
                logger.warning(
                    "recognizer_registry contains unidentified value. "
                    "Using default configuration for recognizer registry"
                )
            # A string value carries no recognizer list, so it must never
            # reach the ["recognizers"] lookup below ("predefined" included).
            return None

        recognizers = None
        if isinstance(recognizer_registry, dict):
            recognizers = recognizer_registry.get("recognizers")
        if recognizers is None:
            logger.warning(
                "recognizer_registry contains unidentified value. "
                "Using default configuration for recognizer registry"
            )
            return None

        recognizer_instances = []
        for recognizer in recognizers:
            name = self._get_name(recognizer)
            recognizer_class = getattr(
                presidio_analyzer.predefined_recognizers, name, None
            )
            if recognizer_class is None:
                # Fail with the offending name rather than the opaque
                # "'NoneType' object is not callable".
                raise ValueError(
                    f"Recognizer '{name}' was not found in predefined recognizers"
                )
            recognizer_instances.append(
                recognizer_class(supported_language=self._get_language(recognizer))
            )

        return RecognizerRegistry(recognizers=recognizer_instances)

    def _load_nlp_engine(self) -> Optional[NlpEngine]:
        """
        Create the NLP engine described by the configuration.

        :return: the engine created by NlpEngineProvider from the
            "nlp_configuration" section, or None when that section is absent
        """
        if "nlp_configuration" not in self.configuration:
            logger.warning(
                "configuration file is missing 'nlp_configuration'. "
                "Using default configuration for nlp engine"
            )
            return None
        nlp_configuration = self.configuration["nlp_configuration"]
        provider = NlpEngineProvider(nlp_configuration=nlp_configuration)
        return provider.create_engine()
Loading