From 58c397b56ff45b2607265ed5b30f2bc4ecc3cd49 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 2 Jan 2025 12:26:13 +0200 Subject: [PATCH 1/8] gliner recognizer + docs --- docs/samples/index.md | 9 +- docs/samples/python/gliner.md | 76 ++++++++ mkdocs.yml | 1 + .../predefined_recognizers/__init__.py | 2 + .../gliner_recognizer.py | 166 ++++++++++++++++++ presidio-analyzer/pyproject.toml | 6 + .../tests/test_gliner_recognizer.py | 118 +++++++++++++ 7 files changed, 374 insertions(+), 4 deletions(-) create mode 100644 docs/samples/python/gliner.md create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/gliner_recognizer.py create mode 100644 presidio-analyzer/tests/test_gliner_recognizer.py diff --git a/docs/samples/index.md b/docs/samples/index.md index d1de81152..c0e2b1317 100644 --- a/docs/samples/index.md +++ b/docs/samples/index.md @@ -25,14 +25,15 @@ | Usage | Text | Python file | [Passing a lambda as a Presidio anonymizer using Faker](python/example_custom_lambda_anonymizer.py)| | Usage | Text | Python file | [Synthetic data generation with OpenAI](python/synth_data_with_openai.ipynb)| | Usage | Text | Python file | [Keeping some entities from being anonymized](python/keep_entities.ipynb)| -| Usage | Text | LiteLLM Proxy | [PII Masking LLM calls across Anthropic/Gemini/Bedrock/Azure, etc.](docker/litellm.md)| +| Usage | Text | LiteLLM Proxy | [PII Masking LLM calls across Anthropic/Gemini/Bedrock/Azure, etc.](docker/litellm.md)| | Usage | Text | Python Notebook | [YAML based no-code configuration](python/no_code_config.ipynb) | +| Usage | Text | Python file | [Using GLiNER within Presidio](gliner.md) | | Usage | | REST API (postman) | [Presidio as a REST endpoint](docker/index.md)| | Deployment | | App Service | [Presidio with App Service](deployments/app-service/index.md)| | Deployment | | Kubernetes | [Presidio with Kubernetes](deployments/k8s/index.md)| | Deployment | | Spark/Azure Databricks | [Presidio with 
Spark](deployments/spark/index.md)| | Deployment | | Azure Data Factory with App Service | [ETL for small dataset](deployments/data-factory/presidio-data-factory.md#option-1-presidio-as-an-http-rest-endpoint) | | Deployment | | Azure Data Factory with Databricks | [ETL for large datasets](deployments/data-factory/presidio-data-factory.md#option-2-presidio-on-azure-databricks) | -| ADF Pipeline | | Azure Data Factory | [Add Presidio as an HTTP service to your Azure Data Factory](deployments/data-factory/presidio-data-factory-template-gallery-http.md) | -| ADF Pipeline | | Azure Data Factory | [Add Presidio on Databricks to your Azure Data Factory](deployments/data-factory/presidio-data-factory-template-gallery-databricks.md) | -| Demo | | Streamlit app | [Create a simple demo app using Streamlit](python/streamlit/index.md) +| ADF Pipeline | | Azure Data Factory | [Add Presidio as an HTTP service to your Azure Data Factory](deployments/data-factory/presidio-data-factory-template-gallery-http.md) | +| ADF Pipeline | | Azure Data Factory | [Add Presidio on Databricks to your Azure Data Factory](deployments/data-factory/presidio-data-factory-template-gallery-databricks.md) | +| Demo | | Streamlit app | [Create a simple demo app using Streamlit](python/streamlit/index.md) diff --git a/docs/samples/python/gliner.md b/docs/samples/python/gliner.md new file mode 100644 index 000000000..3d92bdda5 --- /dev/null +++ b/docs/samples/python/gliner.md @@ -0,0 +1,76 @@ +# Using GLiNER within Presidio + +## What is GLiNER + +GLiNER is a Named Entity Recognition (NER) model capable of identifying any entity type using a bidirectional transformer encoder (BERT-like). It provides a practical alternative to traditional NER models, which are limited to predefined entities, and Large Language Models (LLMs) that, despite their flexibility, are costly and large for resource-constrained scenarios. 
+ +Paper: [GLiNER: Generalist Model for Named Entity Recognition using Bidirectional Transformer](https://arxiv.org/abs/2311.08526) + +Since GLiNER takes as input both the sentence/text and entity types, it can be used for zero-shot named entity recognition. This means that it can recognize entities that were not seen during training. + +## PII Detection with GLiNER + +GLiNER has a trained PII detection model: 🔍 [`urchade/gliner_multi_pii-v1`](https://huggingface.co/urchade/gliner_multi_pii-v1) *(Apache 2.0)* + +This model is capable of recognizing various types of *personally identifiable information* (PII), including but not limited to these entity types: `person`, `organization`, `phone number`, `address`, `passport number`, `email`, `credit card number`, `social security number`, `health insurance id number`, `date of birth`, `mobile phone number`, `bank account number`, `medication`, `cpf`, `driver's license number`, `tax identification number`, `medical condition`, `identity card number`, `national id number`, `ip address`, `email address`, `iban`, `credit card expiration date`, `username`, `health insurance number`, `registration number`, `student id number`, `insurance number`, `flight number`, `landline phone number`, `blood type`, `cvv`, `reservation number`, `digital signature`, `social media handle`, `license plate number`, `cnpj`, `postal code`, `passport_number`, `serial number`, `vehicle registration number`, `credit card brand`, `fax number`, `visa number`, `insurance company`, `identity document number`, `transaction number`, `national health insurance number`, `cvc`, `birth certificate number`, `train ticket number`, `passport expiration date`, and `social_security_number`. + +## Using GLiNER with Presidio + +Presidio has a built-in `EntityRecognizer` for GLiNER: `GLiNERRecognizer`. This recognizer can be used to detect PII entities in text using the GLiNER model. 
+ +### Installation + +To use GLiNER with Presidio, you need to install the `presidio-analyzer` with the `gliner` extra: + +```bash +pip install 'presidio-analyzer[gliner]' +``` + +### Example + +```python +from presidio_analyzer import AnalyzerEngine +from presidio_analyzer.nlp_engine import NlpEngineProvider +from presidio_analyzer.predefined_recognizers import GLiNERRecognizer + + +# Load a small spaCy model as we don't need spaCy's NER +nlp_engine = NlpEngineProvider( + nlp_configuration={ + "nlp_engine_name": "spacy", + "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}], + } +) + +# Create an analyzer engine +analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine.create_engine()) + +# Define and create the GLiNER recognizer +entity_mapping = { + "person": "PERSON", + "name": "PERSON", + "organization": "ORGANIZATION", + "location": "LOCATION" +} + +gliner_recognizer = GLiNERRecognizer( + model_name="urchade/gliner_multi_pii-v1", + entity_mapping=entity_mapping, + flat_ner=False, + multi_label=True, + map_location="cpu", +) + +# Add the GLiNER recognizer to the registry +analyzer_engine.registry.add_recognizer(gliner_recognizer) + +# Remove the spaCy recognizer to avoid NER coming from spaCy +analyzer_engine.registry.remove_recognizer("SpacyRecognizer") + +# Analyze text +results = analyzer_engine.analyze( + text="Hello, my name is Rafi Mor, I'm from Binyamina and I work at Microsoft. 
", language="en" +) + +print(results) +``` \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index bf0f18a93..3b3fc5618 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -90,6 +90,7 @@ nav: - Using Flair as an external PII model: https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py - Using Span Marker as an external PII model: https://github.com/microsoft/presidio/blob/main/docs/samples/python/span_marker_recognizer.py - Using Transformers as an external PII model: samples/python/transformers_recognizer/index.md + - Using GLiNER as an external PII model: samples/python/gliner.md - Pseudonymization (replace PII values using mappings): samples/python/pseudonymization.ipynb - Passing a lambda as a Presidio anonymizer using Faker: https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_custom_lambda_anonymizer.py - Synthetic data generation with OpenAI: samples/python/synth_data_with_openai.ipynb diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index edfe344ae..052978d53 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -17,6 +17,7 @@ from .es_nie_recognizer import EsNieRecognizer from .es_nif_recognizer import EsNifRecognizer from .fi_personal_identity_code_recognizer import FiPersonalIdentityCodeRecognizer +from .gliner_recognizer import GLiNERRecognizer from .iban_recognizer import IbanRecognizer from .in_aadhaar_recognizer import InAadhaarRecognizer from .in_pan_recognizer import InPanRecognizer @@ -96,6 +97,7 @@ "ItIdentityCardRecognizer", "ItPassportRecognizer", "InPanRecognizer", + "GLiNERRecognizer", "PlPeselRecognizer", "AzureAILanguageRecognizer", "InAadhaarRecognizer", diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/gliner_recognizer.py 
b/presidio-analyzer/presidio_analyzer/predefined_recognizers/gliner_recognizer.py new file mode 100644 index 000000000..bd535c0f3 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/gliner_recognizer.py @@ -0,0 +1,166 @@ +import json +import logging +from typing import Dict, List, Optional + +from presidio_analyzer import ( + AnalysisExplanation, + LocalRecognizer, + RecognizerResult, +) +from presidio_analyzer.nlp_engine import NerModelConfiguration, NlpArtifacts + +try: + from gliner import GLiNER, GLiNERConfig +except ImportError: + GLiNER = None + GLiNERConfig = None + + +logger = logging.getLogger("presidio-analyzer") + + +class GLiNERRecognizer(LocalRecognizer): + """GLiNER model based entity recognizer.""" + + def __init__( + self, + supported_entities: Optional[List[str]] = None, + name: str = "GLiNERRecognizer", + supported_language: str = "en", + version: str = "0.0.1", + context: Optional[List[str]] = None, + entity_mapping: Optional[Dict[str, str]] = None, + model_name: str = "urchade/gliner_multi_pii-v1", + flat_ner: bool = True, + multi_label: bool = False, + threshold: float = 0.30, + map_location: str = "cpu", + ): + """GLiNER model based entity recognizer. + + The model is based on the GLiNER library. + + :param supported_entities: List of supported entities for this recognizer. + If None, all entities in Presidio's default configuration will be used. 
+ see `NerModelConfiguration` + :param name: Name of the recognizer + :param supported_language: Language code to use for the recognizer + :param version: Version of the recognizer + :param context: N/A for this recognizer + :param model_name: The name of the GLiNER model to load + :param flat_ner: Whether to use flat NER or not (see GLiNER's documentation) + :param multi_label: Whether to use multi-label classification or not + (see GLiNER's documentation) + :param threshold: The threshold for the model's output + (see GLiNER's documentation) + :param map_location: The device to use for the model + + + """ + + if entity_mapping: + if supported_entities: + raise ValueError( + "entity_mapping and supported_entities cannot be used together" + ) + + self.model_to_presidio_entity_mapping = entity_mapping + else: + if not supported_entities: + logger.info( + "No supported entities provided, " + "using default entities from NerModelConfiguration" + ) + self.model_to_presidio_entity_mapping = ( + NerModelConfiguration().model_to_presidio_entity_mapping + ) + else: + self.model_to_presidio_entity_mapping = { + entity: entity for entity in supported_entities + } + + logger.info("Using entity mapping %s", + json.dumps(entity_mapping, indent=2)) + supported_entities = list(set(self.model_to_presidio_entity_mapping.values())) + self.model_name = model_name + self.map_location = map_location + self.flat_ner = flat_ner + self.multi_label = multi_label + self.threshold = threshold + + self.gliner = None + + super().__init__( + supported_entities=supported_entities, + name=name, + supported_language=supported_language, + version=version, + context=context, + ) + + self.gliner_labels = list(self.model_to_presidio_entity_mapping.keys()) + + def load(self) -> None: + """Load the GLiNER model.""" + + self.gliner = GLiNER.from_pretrained(self.model_name) + + def analyze( + self, + text: str, + entities: List[str], + nlp_artifacts: Optional[NlpArtifacts] = None, + ) -> 
List[RecognizerResult]: + """Analyze text to identify entities using a GLiNER model. + + :param text: The text to be analyzed + :param entities: The list of entities this recognizer is requested to return + :param nlp_artifacts: N/A for this recognizer + """ + + # combine the input labels as this model allows for ad-hoc labels + labels = self.__create_input_labels(entities) + + predictions = self.gliner.predict_entities( + text=text, + labels=labels, + flat_ner=self.flat_ner, + threshold=self.threshold, + multi_label=self.multi_label, + ) + recognizer_results = [] + for prediction in predictions: + presidio_entity = self.model_to_presidio_entity_mapping.get( + prediction["label"], prediction["label"] + ) + if entities and presidio_entity not in entities: + continue + + analysis_explanation = AnalysisExplanation( + recognizer=self.name, + original_score=prediction["score"], + textual_explanation=f"Identified as {presidio_entity} by GLiNER", + ) + + recognizer_results.append( + RecognizerResult( + entity_type=presidio_entity, + start=prediction["start"], + end=prediction["end"], + score=prediction["score"], + analysis_explanation=analysis_explanation, + ) + ) + + return recognizer_results + + def __create_input_labels(self, entities): + """Append the entities requested by the user to the list of labels if it's not there.""" # noqa: E501 + labels = self.gliner_labels + for entity in entities: + if ( + entity not in self.model_to_presidio_entity_mapping.values() + and entity not in self.gliner_labels + ): + labels.append(entity) + return labels diff --git a/presidio-analyzer/pyproject.toml b/presidio-analyzer/pyproject.toml index 0cf6923d9..0b800909c 100644 --- a/presidio-analyzer/pyproject.toml +++ b/presidio-analyzer/pyproject.toml @@ -34,6 +34,7 @@ stanza = { version = "*", optional = true } spacy_stanza = { version = "*", optional = true } azure-ai-textanalytics = { version = "*", optional = true } azure-core = { version = "*", optional = true } +gliner = {version 
= ">=0.2.13,<1.0.0", optional = true} transformers = { version = "*", optional = true } huggingface_hub = { version = "*", optional = true } gunicorn = {version = "*", optional = true} @@ -52,6 +53,11 @@ azure-ai-language = [ "azure-ai-textanalytics", "azure-core", ] +gliner = [ + "transformers", + "huggingface_hub", + "gliner" +] [tool.poetry.group.dev.dependencies] pip = "*" diff --git a/presidio-analyzer/tests/test_gliner_recognizer.py b/presidio-analyzer/tests/test_gliner_recognizer.py new file mode 100644 index 000000000..f5d8b0ace --- /dev/null +++ b/presidio-analyzer/tests/test_gliner_recognizer.py @@ -0,0 +1,118 @@ +import pytest +from unittest.mock import MagicMock, patch + +from presidio_analyzer.predefined_recognizers import GLiNERRecognizer + + +@pytest.fixture +def mock_gliner(): + """ + Fixture to mock GLiNER class and its methods. + """ + # Mock the GLiNER class and its methods + mock_gliner_instance = MagicMock() + # Mock the from_pretrained method to return the mock instance + with patch("gliner.GLiNER.from_pretrained", return_value=mock_gliner_instance): + yield mock_gliner_instance + + +def test_analyze_passed_entities_are_subset_of_entity_mapping( + mock_gliner +): + # Mock GLiNER predict_entities + mock_gliner.predict_entities.return_value = [ + {"label": "person", "start": 11, "end": 19, "score": 0.95}, + {"label": "location", "start": 33, "end": 41, "score": 0.85}, + {"label": "org", "start": 313, "end": 411, "score": 0.85}, + ] + + entity_mapping = { + "person": "PERSON", + "organization": "ORG", + "location": "LOC", + } + + gliner_recognizer = GLiNERRecognizer( + entity_mapping=entity_mapping, + ) + + gliner_recognizer.gliner = mock_gliner + text = "My name is John Doe from Seattle." 
+ entities = ["PERSON", "LOC"] + + results = gliner_recognizer.analyze(text, entities) + + # Check the number of results + assert len(results) == 2 + + # Check the first result + assert results[0].entity_type == "PERSON" + assert results[0].start == 11 + assert results[0].end == 19 + assert results[0].score == pytest.approx(0.95, rel=1e-2) + + # Check the second result + assert results[1].entity_type == "LOC" + assert results[1].start == 33 + assert results[1].end == 41 + assert results[1].score == pytest.approx(0.85, rel=1e-2) + + +def test_analyze_with_unsupported_entity(mock_gliner): + # Mock GLiNER predict_entities + mock_gliner.gliner.predict_entities.return_value = [ + {"label": "BIRD", "start": 0, "end": 5, "score": 0.75}, + ] + + text = "Unknown entity." + entities = ["PERSON", "LOC"] + + gliner_recognizer = GLiNERRecognizer( + supported_entities=entities, + ) + + results = gliner_recognizer.analyze(text, entities) + + # Should filter out unsupported entities + assert len(results) == 0 + + +def test_analyze_with_entity_mapping(mock_gliner): + # Mock GLiNER predict_entities + mock_gliner.predict_entities.return_value = [ + {"label": "organization", "start": 10, "end": 20, "score": 0.90}, + ] + + text = "Works at Microsoft." + entity_mapping = {"organization": "ORG"} + + gliner_recognizer = GLiNERRecognizer( + entity_mapping=entity_mapping, + ) + + results = gliner_recognizer.analyze(text, ["ORG"]) + + # Check mapping from 'organization' to 'ORG' + assert len(results) == 1 + assert results[0].entity_type == "ORG" + assert results[0].start == 10 + assert results[0].end == 20 + assert results[0].score == pytest.approx(0.90, rel=1e-2) + + +def test_analyze_with_no_entities(mock_gliner): + # Mock GLiNER predict_entities + mock_gliner.predict_entities.return_value = [] + + text = "No entities here." 
+ entities = [] + + + gliner_recognizer = GLiNERRecognizer( + supported_entities=["ORG", "LOC", "PER"], + ) + + results = gliner_recognizer.analyze(text, entities) + + # Should return no results + assert len(results) == 0 From 564aceba96b2f44f80be689267edbdc93dc76a79 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 2 Jan 2025 12:26:21 +0200 Subject: [PATCH 2/8] updates to docs --- docs/samples/index.md | 2 +- mkdocs.yml | 86 +++++++++++++++++++++---------------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/docs/samples/index.md b/docs/samples/index.md index c0e2b1317..1dc104b98 100644 --- a/docs/samples/index.md +++ b/docs/samples/index.md @@ -27,7 +27,7 @@ | Usage | Text | Python file | [Keeping some entities from being anonymized](python/keep_entities.ipynb)| | Usage | Text | LiteLLM Proxy | [PII Masking LLM calls across Anthropic/Gemini/Bedrock/Azure, etc.](docker/litellm.md)| | Usage | Text | Python Notebook | [YAML based no-code configuration](python/no_code_config.ipynb) | -| Usage | Text | Python file | [Using GLiNER within Presidio](gliner.md) | +| Usage | Text | Python file | [Using GLiNER within Presidio](python/gliner.md) | | Usage | | REST API (postman) | [Presidio as a REST endpoint](docker/index.md)| | Deployment | | App Service | [Presidio with App Service](deployments/app-service/index.md)| | Deployment | | Kubernetes | [Presidio with Kubernetes](deployments/k8s/index.md)| diff --git a/mkdocs.yml b/mkdocs.yml index 3b3fc5618..0bac05798 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -74,49 +74,49 @@ nav: - Presidio Structured Python API: api/structured_python.md - REST API reference: https://microsoft.github.io/presidio/api-docs/api-docs.html" target="_blank - Samples: - - Usage: - - Home: samples/index.md - - Text: - - Presidio Basic Usage Notebook: samples/python/presidio_notebook.ipynb - - Customizing Presidio Analyzer: samples/python/customizing_presidio_analyzer.ipynb - - Configuring The NLP engine: 
samples/python/ner_model_configuration.ipynb - - Encrypting and Decrypting identified entities: samples/python/encrypt_decrypt.ipynb - - Getting the identified entity value using a custom Operator: samples/python/getting_entity_values.ipynb - - Anonymizing known values: samples/python/Anonymizing known values.ipynb - - Keeping some entities from being anonymized: samples/python/keep_entities.ipynb - - Integrating with external services: samples/python/integrating_with_external_services.ipynb - - Remote Recognizer: https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py - - Azure AI Language as a Remote Recognizer: samples/python/text_analytics/index.md - - Using Flair as an external PII model: https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py - - Using Span Marker as an external PII model: https://github.com/microsoft/presidio/blob/main/docs/samples/python/span_marker_recognizer.py - - Using Transformers as an external PII model: samples/python/transformers_recognizer/index.md - - Using GLiNER as an external PII model: samples/python/gliner.md - - Pseudonymization (replace PII values using mappings): samples/python/pseudonymization.ipynb - - Passing a lambda as a Presidio anonymizer using Faker: https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_custom_lambda_anonymizer.py - - Synthetic data generation with OpenAI: samples/python/synth_data_with_openai.ipynb - - YAML based no-code configuration: samples/python/no_code_config.ipynb - - Data: - - Analyzing structured / semi-structured data in batch: samples/python/batch_processing.ipynb - - Presidio Structured Basic Usage Notebook: samples/python/example_structured.ipynb - - Analyze and Anonymize CSV file: https://github.com/microsoft/presidio/blob/main/docs/samples/python/process_csv_file.py - - Images: - - Redacting Text PII from DICOM images: samples/python/example_dicom_image_redactor.ipynb - - Using an allow 
list with image redaction: samples/python/image_redaction_allow_list_approach.ipynb - - Plot custom bounding boxes: samples/python/plot_custom_bboxes.ipynb - - Example DICOM redaction evaluation: samples/python/example_dicom_redactor_evaluation.ipynb - - PDF: - - Annotating PII in a PDF: samples/python/example_pdf_annotation.ipynb - - Deployment: - - Presidio with App Service: samples/deployments/app-service/index.md - - Presidio with Kubernetes: samples/deployments/k8s/index.md - - Presidio with Spark: samples/deployments/spark/index.md - - Azure Data Factory: - - ETL using AppService/Databricks: samples/deployments/data-factory/presidio-data-factory.md - - Add Presidio as an HTTP service to your Azure Data Factory: samples/deployments/data-factory/presidio-data-factory-template-gallery-http.md - - Add Presidio on Databricks to your Azure Data Factory: samples/deployments/data-factory/presidio-data-factory-template-gallery-databricks.md - - PII Masking LLM calls using LiteLLM proxy: samples/docker/litellm.md - - Demo: - - Create a simple demo app using Streamlit: samples/python/streamlit/index.md + + - Home: samples/index.md + - Text: + - Presidio Basic Usage Notebook: samples/python/presidio_notebook.ipynb + - Customizing Presidio Analyzer: samples/python/customizing_presidio_analyzer.ipynb + - Configuring The NLP engine: samples/python/ner_model_configuration.ipynb + - Encrypting and Decrypting identified entities: samples/python/encrypt_decrypt.ipynb + - Getting the identified entity value using a custom Operator: samples/python/getting_entity_values.ipynb + - Anonymizing known values: samples/python/Anonymizing known values.ipynb + - Keeping some entities from being anonymized: samples/python/keep_entities.ipynb + - Integrating with external services: samples/python/integrating_with_external_services.ipynb + - Remote Recognizer: https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py + - Azure AI Language as a Remote 
Recognizer: samples/python/text_analytics/index.md + - Using Flair as an external PII model: https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py + - Using Span Marker as an external PII model: https://github.com/microsoft/presidio/blob/main/docs/samples/python/span_marker_recognizer.py + - Using Transformers as an external PII model: samples/python/transformers_recognizer/index.md + - Using GLiNER as an external PII model: samples/python/gliner.md + - Pseudonymization (replace PII values using mappings): samples/python/pseudonymization.ipynb + - Passing a lambda as a Presidio anonymizer using Faker: https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_custom_lambda_anonymizer.py + - Synthetic data generation with OpenAI: samples/python/synth_data_with_openai.ipynb + - YAML based no-code configuration: samples/python/no_code_config.ipynb + - Data: + - Analyzing structured / semi-structured data in batch: samples/python/batch_processing.ipynb + - Presidio Structured Basic Usage Notebook: samples/python/example_structured.ipynb + - Analyze and Anonymize CSV file: https://github.com/microsoft/presidio/blob/main/docs/samples/python/process_csv_file.py + - Images: + - Redacting Text PII from DICOM images: samples/python/example_dicom_image_redactor.ipynb + - Using an allow list with image redaction: samples/python/image_redaction_allow_list_approach.ipynb + - Plot custom bounding boxes: samples/python/plot_custom_bboxes.ipynb + - Example DICOM redaction evaluation: samples/python/example_dicom_redactor_evaluation.ipynb + - PDF: + - Annotating PII in a PDF: samples/python/example_pdf_annotation.ipynb + - Deployment: + - Presidio with App Service: samples/deployments/app-service/index.md + - Presidio with Kubernetes: samples/deployments/k8s/index.md + - Presidio with Spark: samples/deployments/spark/index.md + - Azure Data Factory: + - ETL using AppService/Databricks: 
samples/deployments/data-factory/presidio-data-factory.md + - Add Presidio as an HTTP service to your Azure Data Factory: samples/deployments/data-factory/presidio-data-factory-template-gallery-http.md + - Add Presidio on Databricks to your Azure Data Factory: samples/deployments/data-factory/presidio-data-factory-template-gallery-databricks.md + - PII Masking LLM calls using LiteLLM proxy: samples/docker/litellm.md + - Demo app: + - Create a simple demo app using Streamlit: samples/python/streamlit/index.md not_in_nav : | design.md samples/deployments/index.md From cf9be8d50fe252196b3a32d37739f4a74005d477 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 2 Jan 2025 12:31:55 +0200 Subject: [PATCH 3/8] add exception if gliner is not installed --- .../predefined_recognizers/gliner_recognizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/gliner_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/gliner_recognizer.py index bd535c0f3..3bee43049 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/gliner_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/gliner_recognizer.py @@ -102,7 +102,8 @@ def __init__( def load(self) -> None: """Load the GLiNER model.""" - + if not GLiNER: + raise ImportError("GLiNER is not installed. 
Please install it.") self.gliner = GLiNER.from_pretrained(self.model_name) def analyze( From e290c68695e8cb0899e483ee2bc960c7bd365ee6 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 08:08:37 +0200 Subject: [PATCH 4/8] Update pyproject.toml to support only cpu version for onnxruntime in python 3.9 --- presidio-analyzer/pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/presidio-analyzer/pyproject.toml b/presidio-analyzer/pyproject.toml index 0b800909c..60058946a 100644 --- a/presidio-analyzer/pyproject.toml +++ b/presidio-analyzer/pyproject.toml @@ -38,6 +38,8 @@ gliner = {version = ">=0.2.13,<1.0.0", optional = true} transformers = { version = "*", optional = true } huggingface_hub = { version = "*", optional = true } gunicorn = {version = "*", optional = true} +onnxruntime-gpu = { version = "^1.19.1", python = ">=3.10", optional=true} +onnxruntime = { version = "^1.19.1", python = "<3.10", optional=true} [tool.poetry.extras] server = ["flask", "gunicorn"] From 779a8b1686bde7fc03650d072e5e944025b3d89f Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 08:22:24 +0200 Subject: [PATCH 5/8] Update onnxruntime-gpu version constraints --- presidio-analyzer/pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/presidio-analyzer/pyproject.toml b/presidio-analyzer/pyproject.toml index 60058946a..3e773815f 100644 --- a/presidio-analyzer/pyproject.toml +++ b/presidio-analyzer/pyproject.toml @@ -39,7 +39,7 @@ transformers = { version = "*", optional = true } huggingface_hub = { version = "*", optional = true } gunicorn = {version = "*", optional = true} onnxruntime-gpu = { version = "^1.19.1", python = ">=3.10", optional=true} -onnxruntime = { version = "^1.19.1", python = "<3.10", optional=true} +onnxruntime-gpu = { version = ">=1.18,<=1.18.1", python = "<3.10", optional=true} [tool.poetry.extras] server = ["flask", "gunicorn"] @@ -58,6 +58,7 @@ azure-ai-language = [ gliner = [ "transformers", 
"huggingface_hub", + "onnxruntime-gpu" "gliner" ] From 2b4339f91b69890b3fe9bd49e3b357d63705c0d9 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 08:43:20 +0200 Subject: [PATCH 6/8] different onnxruntime versions for different python versions --- presidio-analyzer/pyproject.toml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/presidio-analyzer/pyproject.toml b/presidio-analyzer/pyproject.toml index 3e773815f..ae409c1ef 100644 --- a/presidio-analyzer/pyproject.toml +++ b/presidio-analyzer/pyproject.toml @@ -38,8 +38,11 @@ gliner = {version = ">=0.2.13,<1.0.0", optional = true} transformers = { version = "*", optional = true } huggingface_hub = { version = "*", optional = true } gunicorn = {version = "*", optional = true} -onnxruntime-gpu = { version = "^1.19.1", python = ">=3.10", optional=true} -onnxruntime-gpu = { version = ">=1.18,<=1.18.1", python = "<3.10", optional=true} + +onnxruntime-gpu = [ + {version = ">=1.18,<1.19", markers = "python_version < '3.10'", optional = true }, + {version = ">=1.20", markers = "python_version >= '3.10'", optional = true } +] [tool.poetry.extras] server = ["flask", "gunicorn"] @@ -58,8 +61,8 @@ azure-ai-language = [ gliner = [ "transformers", "huggingface_hub", + "gliner", "onnxruntime-gpu" - "gliner" ] [tool.poetry.group.dev.dependencies] From b467105aed88ed1a622627d53474047fc28490d4 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 08:43:49 +0200 Subject: [PATCH 7/8] different onnxruntime versions for different python versions --- presidio-analyzer/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/pyproject.toml b/presidio-analyzer/pyproject.toml index ae409c1ef..dce54cc58 100644 --- a/presidio-analyzer/pyproject.toml +++ b/presidio-analyzer/pyproject.toml @@ -41,7 +41,7 @@ gunicorn = {version = "*", optional = true} onnxruntime-gpu = [ {version = ">=1.18,<1.19", markers = "python_version < '3.10'", optional = true }, - 
{version = ">=1.20", markers = "python_version >= '3.10'", optional = true } + {version = ">=1.19", markers = "python_version >= '3.10'", optional = true } ] [tool.poetry.extras] From 15a94cc9fdc788a8b0e7226957164d8cd5412aaa Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 6 Jan 2025 09:18:12 +0200 Subject: [PATCH 8/8] skip tests if gliner is not installed --- docs/samples/python/gliner.md | 5 ++++- presidio-analyzer/pyproject.toml | 5 ++--- presidio-analyzer/tests/test_gliner_recognizer.py | 15 +++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/docs/samples/python/gliner.md b/docs/samples/python/gliner.md index 3d92bdda5..7e8bfc99f 100644 --- a/docs/samples/python/gliner.md +++ b/docs/samples/python/gliner.md @@ -26,6 +26,9 @@ To use GLiNER with Presidio, you need to install the `presidio-analyzer` with th pip install 'presidio-analyzer[gliner]' ``` +!!! note + GLiNER only supports python 3.10 and above, while Presidio supports version 3.9 and above. + ### Example ```python @@ -73,4 +76,4 @@ results = analyzer_engine.analyze( ) print(results) -``` \ No newline at end of file +``` diff --git a/presidio-analyzer/pyproject.toml b/presidio-analyzer/pyproject.toml index dce54cc58..ab977f80f 100644 --- a/presidio-analyzer/pyproject.toml +++ b/presidio-analyzer/pyproject.toml @@ -34,14 +34,13 @@ stanza = { version = "*", optional = true } spacy_stanza = { version = "*", optional = true } azure-ai-textanalytics = { version = "*", optional = true } azure-core = { version = "*", optional = true } -gliner = {version = ">=0.2.13,<1.0.0", optional = true} +gliner = {version = ">=0.2.13,<1.0.0", markers = "python_version >= '3.10'", optional = true} transformers = { version = "*", optional = true } huggingface_hub = { version = "*", optional = true } gunicorn = {version = "*", optional = true} onnxruntime-gpu = [ - {version = ">=1.18,<1.19", markers = "python_version < '3.10'", optional = true }, - {version = ">=1.19", markers = "python_version 
>= '3.10'", optional = true } + {version = ">=1.19", markers = "python_version >= '3.10'", optional = true }, ] [tool.poetry.extras] diff --git a/presidio-analyzer/tests/test_gliner_recognizer.py b/presidio-analyzer/tests/test_gliner_recognizer.py index f5d8b0ace..b78f731f7 100644 --- a/presidio-analyzer/tests/test_gliner_recognizer.py +++ b/presidio-analyzer/tests/test_gliner_recognizer.py @@ -1,3 +1,5 @@ +import sys + import pytest from unittest.mock import MagicMock, patch @@ -9,6 +11,9 @@ def mock_gliner(): """ Fixture to mock GLiNER class and its methods. """ + + pytest.importorskip("gliner", reason="GLiNER package is not installed") + # Mock the GLiNER class and its methods mock_gliner_instance = MagicMock() # Mock the from_pretrained method to return the mock instance @@ -19,6 +24,10 @@ def mock_gliner(): def test_analyze_passed_entities_are_subset_of_entity_mapping( mock_gliner ): + + if sys.version_info < (3, 10): + pytest.skip("gliner requires Python >= 3.10") + # Mock GLiNER predict_entities mock_gliner.predict_entities.return_value = [ {"label": "person", "start": 11, "end": 19, "score": 0.95}, @@ -59,6 +68,12 @@ def test_analyze_passed_entities_are_subset_of_entity_mapping( def test_analyze_with_unsupported_entity(mock_gliner): + + + if sys.version_info < (3, 10): + pytest.skip("gliner requires Python >= 3.10") + + # Mock GLiNER predict_entities mock_gliner.gliner.predict_entities.return_value = [ {"label": "BIRD", "start": 0, "end": 5, "score": 0.75},