refactor and modularize context words logic (#828)

microsoft · Feb 22, 2022 · a0c071b · a0c071b
1 parent 81c8c49
commit a0c071b
Show file tree

Hide file tree

Showing 42 changed files with 752 additions and 409 deletions.
diff --git a/docs/api-docs/api-docs.yml b/docs/api-docs/api-docs.yml
@@ -54,6 +54,9 @@ paths:
                           "recognizer": "SpacyRecognizer", "pattern_name": null, "pattern": null, "original_score": 0.85,
                           "score": 0.85, "textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition",
                           "score_context_improvement": 0, "supportive_context_word": "", "validation_result": null
+                        },
+                        "recognition_metadata": {
+                          "recognizer_name": "SpacyRecognizer"
                         }
                       },
                       { "entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999,
@@ -63,6 +66,9 @@ paths:
                           "original_score": 0.3, "score": 0.6499999999999999, "textual_explanation": null,
                           "score_context_improvement": 0.3499999999999999, "supportive_context_word": "driver",
                           "validation_result": null
+                        },
+                        "recognition_metadata": {
+                          "recognizer_name": "UsLicenseRecognizer"
                         }
                       }
                     ]
@@ -74,7 +80,10 @@ paths:
                         "end": 38,
                         "entity_type": "US_DRIVER_LICENSE",
                         "score": 0.6499999999999999,
-                        "start": 30
+                        "start": 30,
+                        "recognition_metadata": {
+                          "recognizer_name": "UsLicenseRecognizer"
+                        }
                       }
                     ]
 
@@ -302,7 +311,7 @@ components:
                     "context": ["zip", "code"],
                     "supported_entity":"ZIP"
                     }
-        ]
+                  ]
                 }
 
     AnonymizeRequest:
@@ -397,6 +406,13 @@ components:
           description: "list of recognizers to be used in the context of this request only (ad-hoc)."
           items:
             $ref: "#/components/schemas/PatternRecognizer"
+        context:
+          type: array
+          description: "list of context words which may help to raise the confidence of recognized entities"
+          items:
+            description: "The context word"
+            type: string
+            example: "address"
 
     AnonymizeRequest:
       type: object
@@ -474,6 +490,16 @@ components:
           example: 0.8
         entity_type:
           $ref: "#/components/schemas/EntityTypes"
+        recognition_metadata:
+          type: object
+          $ref: "#/components/schemas/RecognizedMetadata"
+
+    RecognizedMetadata:
+      type: object
+      properties:
+        recognizer_name:
+          type: string
+          description: "Name of the recognizer that made the decision"
 
     RecognizerResultWithAnaysisExplanation:
       allOf:

diff --git a/e2e-tests/tests/test_analyzer.py b/e2e-tests/tests/test_analyzer.py
@@ -17,8 +17,8 @@ def test_given_a_correct_analyze_input_then_return_full_response():
 
     expected_response = """
     [
-        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation":null},
-        {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation":null}
+        {"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name":"SpacyRecognizer"}, "score": 0.85, "analysis_explanation":null},
+        {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "recognition_metadata": {"recognizer_name":"UsLicenseRecognizer"}, "score": 0.6499999999999999, "analysis_explanation":null}
     ]
     """
     assert response_status == 200
@@ -38,7 +38,7 @@ def test_given_analyze_threshold_input_then_return_result_above_threshold():
 
     expected_response = """
     [
-        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, 
+        {"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85, 
         "analysis_explanation": null
         }
     ]
@@ -163,8 +163,8 @@ def test_given_return_decision_process_false_for_analyze_input_then_return_respo
 
     expected_response = """
     [
-        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation": null},
-        {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation": null}
+        {"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85, "analysis_explanation": null},
+        {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}, "score": 0.6499999999999999, "analysis_explanation": null}
     ]
     """
     assert response_status == 200
@@ -175,26 +175,26 @@ def test_given_return_decision_process_false_for_analyze_input_then_return_respo
 def test_given_decision_process_enabled_for_analyze_input_then_return_response_with_decision_process():
     request_body = """
     {
-        "text": "John Smith drivers license is AC432223", "language": "en", "return_decision_process": true
+        "text": "John Smith license is AC432223", "language": "en", "return_decision_process": true
     }
     """
     response_status, response_content = analyze(request_body)
 
     expected_response = """
     [
-        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, 
+        {"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"},"score": 0.85, 
         "analysis_explanation": {
             "recognizer": "SpacyRecognizer", "pattern_name": null, "pattern": null, "original_score": 0.85, "score": 0.85, 
             "textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition", 
             "score_context_improvement": 0, "supportive_context_word": "", "validation_result": null 
             }
         },
-        {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, 
+        {"entity_type": "US_DRIVER_LICENSE", "start": 22, "end": 30, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}, "score": 0.6499999999999999, 
         "analysis_explanation": {
             "recognizer": "UsLicenseRecognizer", "pattern_name": "Driver License - Alphanumeric (weak)", 
             "pattern": "\\\\b([A-Z][0-9]{3,6}|[A-Z][0-9]{5,9}|[A-Z][0-9]{6,8}|[A-Z][0-9]{4,8}|[A-Z][0-9]{9,11}|[A-Z]{1,2}[0-9]{5,6}|H[0-9]{8}|V[0-9]{6}|X[0-9]{8}|A-Z]{2}[0-9]{2,5}|[A-Z]{2}[0-9]{3,7}|[0-9]{2}[A-Z]{3}[0-9]{5,6}|[A-Z][0-9]{13,14}|[A-Z][0-9]{18}|[A-Z][0-9]{6}R|[A-Z][0-9]{9}|[A-Z][0-9]{1,12}|[0-9]{9}[A-Z]|[A-Z]{2}[0-9]{6}[A-Z]|[0-9]{8}[A-Z]{2}|[0-9]{3}[A-Z]{2}[0-9]{4}|[A-Z][0-9][A-Z][0-9][A-Z]|[0-9]{7,8}[A-Z])\\\\b", 
             "original_score": 0.3, "score": 0.6499999999999999, "textual_explanation": null, 
-            "score_context_improvement": 0.3499999999999999, "supportive_context_word": "driver", "validation_result": null
+            "score_context_improvement": 0.3499999999999999, "supportive_context_word": "license", "validation_result": null
             }
         }
     ]
@@ -203,6 +203,63 @@ def test_given_decision_process_enabled_for_analyze_input_then_return_response_w
     assert equal_json_strings(expected_response, response_content)
 
 
+@pytest.mark.api
+def test_given_decision_process_enabled_for_analyze_input_with_aditional_context_then_return_response_with_decision_process_and_correct_supportive_context_word():
+    request_body = """
+    {
+        "text": "John Smith D.L. is AC432223", "language": "en", "return_decision_process": true, "context": ["Driver license"]
+    }
+    """
+    response_status, response_content = analyze(request_body)
+
+    expected_response = """
+    [
+        {
+            "analysis_explanation": {
+                "original_score": 0.85,
+                "pattern": null,
+                "pattern_name": null,
+                "recognizer": "SpacyRecognizer",
+                "score": 0.85,
+                "score_context_improvement": 0,
+                "supportive_context_word": "",
+                "textual_explanation": "Identified as PERSON by Spacy\'s Named Entity Recognition",
+                "validation_result": null
+            },
+            "end": 15,
+            "entity_type": "PERSON",
+            "recognition_metadata": {
+                "recognizer_name": "SpacyRecognizer"
+            },
+            "score": 0.85,
+            "start": 0
+        },
+        {
+            "analysis_explanation": {
+                "original_score": 0.3,
+                "pattern": "\\\\b([A-Z][0-9]{3,6}|[A-Z][0-9]{5,9}|[A-Z][0-9]{6,8}|[A-Z][0-9]{4,8}|[A-Z][0-9]{9,11}|[A-Z]{1,2}[0-9]{5,6}|H[0-9]{8}|V[0-9]{6}|X[0-9]{8}|A-Z]{2}[0-9]{2,5}|[A-Z]{2}[0-9]{3,7}|[0-9]{2}[A-Z]{3}[0-9]{5,6}|[A-Z][0-9]{13,14}|[A-Z][0-9]{18}|[A-Z][0-9]{6}R|[A-Z][0-9]{9}|[A-Z][0-9]{1,12}|[0-9]{9}[A-Z]|[A-Z]{2}[0-9]{6}[A-Z]|[0-9]{8}[A-Z]{2}|[0-9]{3}[A-Z]{2}[0-9]{4}|[A-Z][0-9][A-Z][0-9][A-Z]|[0-9]{7,8}[A-Z])\\\\b",
+                "pattern_name": "Driver License - Alphanumeric (weak)",
+                "recognizer": "UsLicenseRecognizer",
+                "score": 0.6499999999999999,
+                "score_context_improvement": 0.3499999999999999,
+                "supportive_context_word": "driver",
+                "textual_explanation": null,
+                "validation_result": null
+            },
+            "end": 27,
+            "entity_type": "US_DRIVER_LICENSE",
+            "recognition_metadata": {
+                "recognizer_name": "UsLicenseRecognizer"
+            },
+            "score": 0.6499999999999999,
+            "start": 19
+        }
+    ]
+    """
+    assert response_status == 200
+    assert equal_json_strings(expected_response, response_content)
+
+
 def test_given_analyze_entities_input_then_return_results_only_with_those_entities():
     request_body = """
     {
@@ -214,7 +271,7 @@ def test_given_analyze_entities_input_then_return_results_only_with_those_entiti
 
     expected_response = """
     [ 
-        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, 
+        {"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85, 
         "analysis_explanation": null
         }
     ]
@@ -300,9 +357,9 @@ def test_given_ad_hoc_pattern_recognizer_the_right_entities_are_returned():
 
     expected_response = """
      [
-         {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation":null},
-         {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation":null},
-         {"entity_type": "ZIP", "start": 50, "end": 55, "score": 0.01, "analysis_explanation":null}
+         {"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85, "analysis_explanation":null},
+         {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}, "score": 0.6499999999999999, "analysis_explanation":null},
+         {"entity_type": "ZIP", "start": 50, "end": 55, "recognition_metadata": {"recognizer_name": "Zip code Recognizer"}, "score": 0.01, "analysis_explanation":null}
      ]
      """
     assert response_status == 200
@@ -369,9 +426,9 @@ def test_given_ad_hoc_pattern_recognizer_context_raises_confidence():
 
     expected_response = """
      [
-         {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation":null},
-         {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation":null},
-         {"entity_type": "ZIP", "start": 50, "end": 55, "score": 0.4, "analysis_explanation":null}
+         {"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85, "analysis_explanation":null},
+         {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}, "score": 0.6499999999999999, "analysis_explanation":null},
+         {"entity_type": "ZIP", "start": 50, "end": 55, "recognition_metadata": {"recognizer_name": "Zip code Recognizer"}, "score": 0.4, "analysis_explanation":null}
      ]
      """
     assert response_status == 200
@@ -405,9 +462,9 @@ def test_given_ad_hoc_deny_list_recognizer_the_right_entities_are_returned():
 
     expected_response = """
      [
-         {"entity_type": "PERSON", "start": 4, "end": 14, "score": 0.85, "analysis_explanation":null},
-         {"entity_type": "US_DRIVER_LICENSE", "start": 36, "end": 44, "score": 0.6499999999999999, "analysis_explanation":null},
-         {"entity_type": "MR_TITLE", "start": 0, "end": 3, "score": 1.0, "analysis_explanation":null}
+         {"entity_type": "PERSON", "start": 4, "end": 14, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85, "analysis_explanation":null},
+         {"entity_type": "US_DRIVER_LICENSE", "start": 36, "end": 44, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}, "score": 0.6499999999999999, "analysis_explanation":null},
+         {"entity_type": "MR_TITLE", "start": 0, "end": 3, "recognition_metadata": {"recognizer_name": "Mr. Recognizer"}, "score": 1.0, "analysis_explanation":null}
      ]
      """
     assert response_status == 200

diff --git a/e2e-tests/tests/test_e2e_integration_flows.py b/e2e-tests/tests/test_e2e_integration_flows.py
@@ -35,10 +35,10 @@ def test_given_text_with_pii_then_analyze_and_anonymize_successfully():
     expected_response = """
     [
         {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, 
-        "analysis_explanation": null
+        "analysis_explanation": null, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}
         },
         {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, 
-        "analysis_explanation": null
+        "analysis_explanation": null, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}
         }
     ]
     """
@@ -79,7 +79,7 @@ def test_given_a_correct_analyze_input_high_threashold_then_anonymize_partially(
 
     expected_response = """
     [
-        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation": null}
+        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation": null, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}}
     ]
     """
 
@@ -125,7 +125,7 @@ def test_given_a_correct_analyze_input_with_high_threshold_and_unmatched_entitie
 
     expected_response = """
     [
-        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation": null}
+        {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation": null, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}}
     ]
     """
 
@@ -165,10 +165,10 @@ def test_given_an_unknown_entity_then_anonymize_uses_defaults():
     expected_response = """
     [
         {"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, 
-        "analysis_explanation": null
+        "analysis_explanation": null, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}
         },
         {"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, 
-        "analysis_explanation": null
+        "analysis_explanation": null, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}
         }
     ]
     """

diff --git a/presidio-analyzer/app.py b/presidio-analyzer/app.py
@@ -65,6 +65,7 @@ def analyze() -> Tuple[str, int]:
                     entities=req_data.entities,
                     return_decision_process=req_data.return_decision_process,
                     ad_hoc_recognizers=req_data.ad_hoc_recognizers,
+                    context=req_data.context,
                 )
 
                 return Response(

diff --git a/presidio-analyzer/presidio_analyzer/__init__.py b/presidio-analyzer/presidio_analyzer/__init__.py
@@ -12,6 +12,8 @@
 from presidio_analyzer.recognizer_registry import RecognizerRegistry
 from presidio_analyzer.analyzer_engine import AnalyzerEngine
 from presidio_analyzer.analyzer_request import AnalyzerRequest
+from presidio_analyzer.context_aware_enhancers import ContextAwareEnhancer
+from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
 
 # Define default loggers behavior
 
@@ -40,4 +42,6 @@
     "RecognizerRegistry",
     "AnalyzerEngine",
     "AnalyzerRequest",
+    "ContextAwareEnhancer",
+    "LemmaContextAwareEnhancer",
 ]
diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine.py b/presidio-analyzer/presidio_analyzer/analyzer_engine.py
@@ -9,6 +9,10 @@
 )
 from presidio_analyzer.app_tracer import AppTracer
 from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
+from presidio_analyzer.context_aware_enhancers import (
+    ContextAwareEnhancer,
+    LemmaContextAwareEnhancer,
+)
 
 logger = logging.getLogger("presidio-analyzer")
 
@@ -30,6 +34,9 @@ class AnalyzerEngine:
     for detected entities to be returned
     :param supported_languages: List of possible languages this engine could be run on.
     Used for loading the right NLP models and recognizers for these languages.
+    :param context_aware_enhancer: instance of type ContextAwareEnhancer for enhancing
+    confidence score based on context words (a LemmaContextAwareEnhancer is created
+    by default if None is passed)
     """
 
     def __init__(
@@ -40,6 +47,7 @@ def __init__(
         log_decision_process: bool = False,
         default_score_threshold: float = 0,
         supported_languages: List[str] = None,
+        context_aware_enhancer: Optional[ContextAwareEnhancer] = None,
     ):
         if not supported_languages:
             supported_languages = ["en"]
@@ -70,6 +78,15 @@ def __init__(
         self.log_decision_process = log_decision_process
         self.default_score_threshold = default_score_threshold
 
+        if not context_aware_enhancer:
+            logger.debug(
+                "context aware enhancer not provided, creating default"
+                + " lemma based enhancer."
+            )
+            context_aware_enhancer = LemmaContextAwareEnhancer()
+
+        self.context_aware_enhancer = context_aware_enhancer
+
     def get_recognizers(self, language: Optional[str] = None) -> List[EntityRecognizer]:
         """
         Return a list of PII recognizers currently loaded.
@@ -114,6 +131,7 @@ def analyze(
         score_threshold: Optional[float] = None,
         return_decision_process: Optional[bool] = False,
         ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
+        context: Optional[List[str]] = None,
     ) -> List[RecognizerResult]:
         """
         Find PII entities in text using different PII recognizers for a given language.
@@ -129,6 +147,8 @@ def analyze(
         returned in the response.
         :param ad_hoc_recognizers: List of recognizers which will be used only
         for this specific request.
+        :param context: List of context words to enhance confidence score if matched
+        with the recognized entity's recognizer context
         :return: an array of the found entities in the text
 
         :example:
@@ -187,6 +207,16 @@ def analyze(
                 json.dumps([str(result.to_dict()) for result in results]),
             )
 
+        # Update results in case surrounding words or external context are relevant to
+        # the context words.
+        results = self.context_aware_enhancer.enhance_using_context(
+            text=text,
+            raw_results=results,
+            nlp_artifacts=nlp_artifacts,
+            recognizers=recognizers,
+            context=context,
+        )
+
         # Remove duplicates or low score results
         results = EntityRecognizer.remove_duplicates(results)
         results = self.__remove_low_scores(results, score_threshold)

diff --git a/presidio-analyzer/presidio_analyzer/analyzer_request.py b/presidio-analyzer/presidio_analyzer/analyzer_request.py
@@ -33,3 +33,4 @@ def __init__(self, req_data: Dict):
             self.ad_hoc_recognizers = [
                 PatternRecognizer.from_dict(rec) for rec in ad_hoc_recognizers
             ]
+        self.context = req_data.get("context")