Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

load analyzer configuration from file #1337

Merged
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
8a45ec6
initial version of loader
Mar 20, 2024
b41428c
Merge branch 'main' into feature/api_additions
omri374 Mar 20, 2024
847806b
addressed comments
Mar 20, 2024
07d093e
linting fixes
Mar 20, 2024
5db852f
Merge branch 'feature/api_additions' of github.com:RoeyBC/presidio in…
Mar 20, 2024
519cc05
re structured recognizers in yaml
Mar 29, 2024
1b326cc
Merge branch 'main' into feature/api_additions
omri374 Mar 30, 2024
6c28004
addressed comments and fixed predefined recognizers loading
Mar 31, 2024
37a0760
added engine provider to analyzer init
Mar 31, 2024
cdb7eb4
moved logic to recognizer registry provider
Mar 31, 2024
f4eba3e
some name fixes to recognizer provider
Mar 31, 2024
b4e9c85
added language support to recognizer registry
Apr 4, 2024
288ce70
fixed interface issues, added unit tests for providers
Apr 5, 2024
e4ab92d
Merge branch 'feature/api_additions' of github.com:RoeyBC/presidio in…
Apr 5, 2024
bd7ff75
fixed tests, addressed comments
Apr 10, 2024
3ee013b
Merge branch 'main' into feature/api_additions
omri374 Apr 11, 2024
da5a8d1
added yaml configuration to package, fixed linting rules
Apr 12, 2024
efce9d8
Merge branch 'feature/api_additions' of github.com:RoeyBC/presidio in…
Apr 12, 2024
aac0f78
move all conf file to a single location
Apr 15, 2024
bb03246
remove file from previous location
Apr 15, 2024
4212a77
Merge branch 'main' into feature/api_additions
SharonHart Apr 16, 2024
21e6ebc
Merge branch 'main' into feature/api_additions
omri374 Apr 18, 2024
b264637
merged from main, added default conf file for engine provider
Apr 18, 2024
3455e56
addressed some comments
Apr 18, 2024
01268ee
setup fixups
Apr 18, 2024
c755784
remove redundant line
Apr 18, 2024
1a6e909
fix long line
Apr 18, 2024
6cc6261
fixing linting errors
Apr 18, 2024
fa78986
Update presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py
omri374 Apr 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions presidio-analyzer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
from flask import Flask, request, jsonify, Response
from werkzeug.exceptions import HTTPException

from presidio_analyzer.analyzer_engine import AnalyzerEngine
from presidio_analyzer.analyzer_request import AnalyzerRequest
from presidio_analyzer import AnalyzerEngine, AnalyzerEngineProvider, AnalyzerRequest

DEFAULT_PORT = "3000"

Expand All @@ -37,7 +36,7 @@ def __init__(self):
self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
self.app = Flask(__name__)
self.logger.info("Starting analyzer engine")
self.engine = AnalyzerEngine()
omri374 marked this conversation as resolved.
Show resolved Hide resolved
self.engine: AnalyzerEngine = AnalyzerEngineProvider(conf_file=os.path.abspath("conf/analyzer.yaml")).create_engine()
self.logger.info(WELCOME_MESSAGE)

@self.app.route("/health")
Expand Down
3 changes: 3 additions & 0 deletions presidio-analyzer/conf/analyzer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Analyzer engine settings loaded through AnalyzerEngineProvider(conf_file=...).
# Languages handed to AnalyzerEngine and to the recognizer registry
# (the provider falls back to ["en"] when this key is absent).
supported_languages:
- en
# Passed to AnalyzerEngine as default_score_threshold
# (the provider falls back to 0 when this key is absent).
default_score_threshold: 0
118 changes: 118 additions & 0 deletions presidio-analyzer/conf/default_recognizers.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Default recognizer registry configuration.
#
# Entries under `recognizers` come in two shapes:
#   * a mapping with `name`, `supported_languages` and `type: predefined`;
#   * a bare recognizer name (the same names listed in PREDEFINED_RECOGNIZERS).
# NOTE(review): bare names presumably inherit the top-level
# `supported_languages` - confirm against RecognizerRegistryProvider.
supported_languages:
  - en

recognizers:
  - name: UsBankRecognizer
    supported_languages:
      - en
    type: predefined

  - name: UsLicenseRecognizer
    supported_languages:
      - en
    type: predefined

  - name: UsItinRecognizer
    supported_languages:
      - en
    type: predefined

  - name: UsPassportRecognizer
    supported_languages:
      - en
    type: predefined

  - name: UsSsnRecognizer
    supported_languages:
      - en
    type: predefined

  - name: NhsRecognizer
    supported_languages:
      - en
    type: predefined

  - name: SgFinRecognizer
    supported_languages:
      - en
    type: predefined

  - name: AuAbnRecognizer
    supported_languages:
      - en
    type: predefined

  - name: AuAcnRecognizer
    supported_languages:
      - en
    type: predefined

  - name: AuTfnRecognizer
    supported_languages:
      - en
    type: predefined

  - name: AuMedicareRecognizer
    supported_languages:
      - en
    type: predefined

  - name: InPanRecognizer
    supported_languages:
      - en
    type: predefined

  - name: InAadhaarRecognizer
    supported_languages:
      - en
    type: predefined

  - name: InVehicleRegistrationRecognizer
    supported_languages:
      - en
    type: predefined

  - name: EsNifRecognizer
    supported_languages:
      - es
    type: predefined

  - name: ItDriverLicenseRecognizer
    supported_languages:
      - it
    type: predefined

  - name: ItFiscalCodeRecognizer
    supported_languages:
      - it
    type: predefined

  - name: ItVatCodeRecognizer
    supported_languages:
      - it
    type: predefined

  - name: ItIdentityCardRecognizer
    supported_languages:
      - it
    type: predefined

  - name: ItPassportRecognizer
    supported_languages:
      - it
    type: predefined

  - name: PlPeselRecognizer
    supported_languages:
      - pl
    type: predefined

  - CreditCardRecognizer
  - CryptoRecognizer
  - DateRecognizer
  - EmailRecognizer
  - IbanRecognizer
  - IpRecognizer
  - MedicalLicenseRecognizer
  - PhoneRecognizer
  - UrlRecognizer
2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from presidio_analyzer.remote_recognizer import RemoteRecognizer
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.analyzer_engine import AnalyzerEngine
from presidio_analyzer.analyzer_engine_provider import AnalyzerEngineProvider
from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine
from presidio_analyzer.analyzer_request import AnalyzerRequest
from presidio_analyzer.context_aware_enhancers import ContextAwareEnhancer
Expand Down Expand Up @@ -45,6 +46,7 @@
"RemoteRecognizer",
"RecognizerRegistry",
"AnalyzerEngine",
"AnalyzerEngineProvider",
"AnalyzerRequest",
"ContextAwareEnhancer",
"LemmaContextAwareEnhancer",
Expand Down
16 changes: 10 additions & 6 deletions presidio-analyzer/presidio_analyzer/analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
from presidio_analyzer import (
RecognizerRegistry,
RecognizerResult,
EntityRecognizer,
EntityRecognizer
)
from presidio_analyzer.app_tracer import AppTracer
from presidio_analyzer.context_aware_enhancers import (
ContextAwareEnhancer,
LemmaContextAwareEnhancer,
)
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider, NlpArtifacts
from presidio_analyzer.recognizer_registry_provider import RecognizerRegistryProvider

logger = logging.getLogger("presidio-analyzer")

Expand Down Expand Up @@ -57,9 +58,6 @@ def __init__(
provider = NlpEngineProvider()
nlp_engine = provider.create_engine()

if not registry:
logger.info("registry not provided, creating default.")
registry = RecognizerRegistry()
if not app_tracer:
app_tracer = AppTracer()
self.app_tracer = app_tracer
Expand All @@ -70,14 +68,20 @@ def __init__(
if not self.nlp_engine.is_loaded():
self.nlp_engine.load()

self.registry = registry
if not registry:
logger.info("registry not provided, creating default.")
provider = RecognizerRegistryProvider(registry_configuration={"supported_languages":self.supported_languages})
registry = provider.create_recognizer_registry()
registry.add_nlp_recognizer(nlp_engine=self.nlp_engine)

# load all recognizers
# added to support the previous interface
if not registry.recognizers:
registry.load_predefined_recognizers(
nlp_engine=self.nlp_engine, languages=self.supported_languages
)

self.registry = registry

self.log_decision_process = log_decision_process
self.default_score_threshold = default_score_threshold

Expand Down
80 changes: 80 additions & 0 deletions presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import yaml
roeybc marked this conversation as resolved.
Show resolved Hide resolved
import os
import logging
from pathlib import Path
from typing import Optional, Union, List

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpEngine
from presidio_analyzer.recognizer_registry_provider import RecognizerRegistryProvider

logger = logging.getLogger("presidio-analyzer")


class AnalyzerEngineProvider:
    """
    Utility class for loading Presidio Analyzer.

    Use this class to load presidio analyzer engine from a yaml file.

    :param conf_file: Path to the analyzer yaml configuration file.
        When it is omitted or does not exist, an empty configuration is used
        and every setting falls back to its default.
    """

    def __init__(self, conf_file: Optional[Union[Path, str]] = None):
        self.configuration = {}
        if not conf_file or not os.path.exists(conf_file):
            logger.warning(
                "configuration file is missing. "
                "Using default configuration for analyzer engine"
            )
            return

        # Use a context manager so the file handle is closed deterministically.
        with open(conf_file) as file:
            # An empty yaml document parses to None; keep `configuration` a
            # dict so the `.get(...)` lookups below keep working.
            self.configuration = yaml.safe_load(file) or {}

    def create_engine(self) -> AnalyzerEngine:
        """
        Load Presidio Analyzer from yaml configuration file.

        :return: analyzer engine initialized with yaml configuration
        """

        nlp_engine = self._load_nlp_engine()
        supported_languages = self.configuration.get("supported_languages", ["en"])
        default_score_threshold = self.configuration.get("default_score_threshold", 0)

        registry = self._load_recognizer_registry(supported_languages)

        analyzer = AnalyzerEngine(
            nlp_engine=nlp_engine,
            registry=registry,
            supported_languages=supported_languages,
            default_score_threshold=default_score_threshold,
        )

        # Use the engine held by the analyzer rather than the local
        # `nlp_engine`, which is None when no 'nlp_configuration' was given.
        analyzer.registry.add_nlp_recognizer(nlp_engine=analyzer.nlp_engine)

        return analyzer

    def _load_recognizer_registry(
        self, supported_languages: Optional[List[str]] = None
    ) -> RecognizerRegistry:
        """
        Build the recognizer registry from the 'recognizer_registry' section.

        :param supported_languages: languages that override any
            'supported_languages' entry of the registry section
        :return: registry created by RecognizerRegistryProvider
        """
        if "recognizer_registry" not in self.configuration:
            logger.warning(
                "configuration file is missing 'recognizer_registry'. "
                "Using default configuration for recognizer registry"
            )
        # `or {}` also covers a key that is present but left empty (null).
        registry_configuration = self.configuration.get("recognizer_registry") or {}
        provider = RecognizerRegistryProvider(
            registry_configuration={
                **registry_configuration,
                "supported_languages": supported_languages,
            }
        )
        return provider.create_recognizer_registry()

    def _load_nlp_engine(self) -> Optional[NlpEngine]:
        """
        Build the NLP engine from the 'nlp_configuration' section.

        :return: the created engine, or None when the section is absent
        """
        if "nlp_configuration" not in self.configuration:
            logger.warning(
                "configuration file is missing 'nlp_configuration'. "
                "Using default configuration for nlp engine"
            )
            return None
        nlp_configuration = self.configuration["nlp_configuration"]
        provider = NlpEngineProvider(nlp_configuration=nlp_configuration)
        return provider.create_engine()
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
from .sg_uen_recognizer import SgUenRecognizer
from .fi_personal_identity_code_recognizer import FiPersonalIdentityCodeRecognizer

# Recognizer class names kept as plain strings; the same names appear as bare
# entries in conf/default_recognizers.yaml.
# NOTE(review): presumably the language-agnostic recognizers - confirm against
# the recognizer registry provider.
PREDEFINED_RECOGNIZERS = [
    "PhoneRecognizer",
    "CreditCardRecognizer",
    "CryptoRecognizer",
    "DateRecognizer",
    "EmailRecognizer",
    "IpRecognizer",
    "IbanRecognizer",
    "MedicalLicenseRecognizer",
    "UrlRecognizer",
]

NLP_RECOGNIZERS = {
"spacy": SpacyRecognizer,
"stanza": StanzaRecognizer,
Expand Down
31 changes: 21 additions & 10 deletions presidio-analyzer/presidio_analyzer/recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,32 @@ def __init__(
self,
recognizers: Optional[Iterable[EntityRecognizer]] = None,
global_regex_flags: Optional[int] = re.DOTALL | re.MULTILINE | re.IGNORECASE,
supported_languages: Optional[List[str]] = None
):
if recognizers:
self.recognizers = recognizers
else:
self.recognizers = []
self.global_regex_flags = global_regex_flags
self.supported_languages = supported_languages

def _create_nlp_recognizer(
    self, nlp_engine: NlpEngine = None, supported_language: str = None
) -> SpacyRecognizer:
    """
    Instantiate the NLP recognizer matching the given engine for one language.

    :param nlp_engine: engine used to pick the recognizer and, when given,
        to supply the recognizer's supported entities
    :param supported_language: language the new recognizer is created for
    :return: the newly created recognizer instance
    """
    recognizer_factory = self._get_nlp_recognizer(nlp_engine)

    init_kwargs = {"supported_language": supported_language}
    # Only an actual engine can report entities; without one the recognizer
    # is created with the language alone.
    if nlp_engine:
        init_kwargs["supported_entities"] = nlp_engine.get_supported_entities()

    return recognizer_factory(**init_kwargs)

def add_nlp_recognizer(self, nlp_engine: NlpEngine = None) -> None:
    """
    Append one NLP recognizer per language already present in the registry.

    The languages are read from the ``supported_language`` of the recognizers
    currently registered, so an empty registry gains no NLP recognizer.

    :param nlp_engine: NLP engine forwarded to ``_create_nlp_recognizer``
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; collecting
    # through a set made the order of the appended recognizers arbitrary.
    languages = list(
        dict.fromkeys(
            recognizer.supported_language for recognizer in self.recognizers
        )
    )

    # Build the full list before extending so a failure while creating a
    # recognizer leaves the registry untouched.
    nlp_recognizers = [
        self._create_nlp_recognizer(
            nlp_engine=nlp_engine, supported_language=language
        )
        for language in languages
    ]
    self.recognizers.extend(nlp_recognizers)

def load_predefined_recognizers(
self, languages: Optional[List[str]] = None, nlp_engine: NlpEngine = None
Expand All @@ -87,8 +107,6 @@ def load_predefined_recognizers(
if not languages:
languages = ["en"]

nlp_recognizer = self._get_nlp_recognizer(nlp_engine)

recognizers_map = {
"en": [
UsBankRecognizer,
Expand Down Expand Up @@ -142,14 +160,7 @@ def load_predefined_recognizers(
for rc in recognizers_map.get("ALL", [])
]
self.recognizers.extend(all_recognizers)
if nlp_engine:
nlp_recognizer_inst = nlp_recognizer(
supported_language=lang,
supported_entities=nlp_engine.get_supported_entities(),
)
else:
nlp_recognizer_inst = nlp_recognizer(supported_language=lang)
self.recognizers.append(nlp_recognizer_inst)
self.add_nlp_recognizer(nlp_engine=nlp_engine)

@staticmethod
def _get_nlp_recognizer(
Expand Down
Loading
Loading