Skip to content

Commit

Permalink
refactor and modularize context words logic (#828)
Browse files Browse the repository at this point in the history
  • Loading branch information
guybartal authored Feb 22, 2022
1 parent 81c8c49 commit a0c071b
Show file tree
Hide file tree
Showing 42 changed files with 752 additions and 409 deletions.
30 changes: 28 additions & 2 deletions docs/api-docs/api-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ paths:
"recognizer": "SpacyRecognizer", "pattern_name": null, "pattern": null, "original_score": 0.85,
"score": 0.85, "textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition",
"score_context_improvement": 0, "supportive_context_word": "", "validation_result": null
},
"recognition_metadata": {
"recognizer_name": "SpacyRecognizer"
}
},
{ "entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999,
Expand All @@ -63,6 +66,9 @@ paths:
"original_score": 0.3, "score": 0.6499999999999999, "textual_explanation": null,
"score_context_improvement": 0.3499999999999999, "supportive_context_word": "driver",
"validation_result": null
},
"recognition_metadata": {
"recognizer_name": "UsLicenseRecognizer"
}
}
]
Expand All @@ -74,7 +80,10 @@ paths:
"end": 38,
"entity_type": "US_DRIVER_LICENSE",
"score": 0.6499999999999999,
"start": 30
"start": 30,
"recognition_metadata": {
"recognizer_name": "UsLicenseRecognizer"
}
}
]

Expand Down Expand Up @@ -302,7 +311,7 @@ components:
"context": ["zip", "code"],
"supported_entity":"ZIP"
}
]
]
}

AnonymizeRequest:
Expand Down Expand Up @@ -397,6 +406,13 @@ components:
description: "list of recognizers to be used in the context of this request only (ad-hoc)."
items:
$ref: "#/components/schemas/PatternRecognizer"
context:
type: array
description: "list of context words which may help raise the confidence of recognized entities"
items:
description: "The context word"
type: string
example: "address"

AnonymizeRequest:
type: object
Expand Down Expand Up @@ -474,6 +490,16 @@ components:
example: 0.8
entity_type:
$ref: "#/components/schemas/EntityTypes"
recognition_metadata:
type: object
$ref: "#/components/schemas/RecognizedMetadata"

RecognizedMetadata:
type: object
properties:
recognizer_name:
type: string
description: "Name of recognizer that made the decision"

RecognizerResultWithAnaysisExplanation:
allOf:
Expand Down
95 changes: 76 additions & 19 deletions e2e-tests/tests/test_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def test_given_a_correct_analyze_input_then_return_full_response():

expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation":null},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation":null}
{"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name":"SpacyRecognizer"}, "score": 0.85, "analysis_explanation":null},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "recognition_metadata": {"recognizer_name":"UsLicenseRecognizer"}, "score": 0.6499999999999999, "analysis_explanation":null}
]
"""
assert response_status == 200
Expand All @@ -38,7 +38,7 @@ def test_given_analyze_threshold_input_then_return_result_above_threshold():

expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85,
{"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85,
"analysis_explanation": null
}
]
Expand Down Expand Up @@ -163,8 +163,8 @@ def test_given_return_decision_process_false_for_analyze_input_then_return_respo

expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation": null},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation": null}
{"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85, "analysis_explanation": null},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}, "score": 0.6499999999999999, "analysis_explanation": null}
]
"""
assert response_status == 200
Expand All @@ -175,26 +175,26 @@ def test_given_return_decision_process_false_for_analyze_input_then_return_respo
def test_given_decision_process_enabled_for_analyze_input_then_return_response_with_decision_process():
request_body = """
{
"text": "John Smith drivers license is AC432223", "language": "en", "return_decision_process": true
"text": "John Smith license is AC432223", "language": "en", "return_decision_process": true
}
"""
response_status, response_content = analyze(request_body)

expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85,
{"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"},"score": 0.85,
"analysis_explanation": {
"recognizer": "SpacyRecognizer", "pattern_name": null, "pattern": null, "original_score": 0.85, "score": 0.85,
"textual_explanation": "Identified as PERSON by Spacy's Named Entity Recognition",
"score_context_improvement": 0, "supportive_context_word": "", "validation_result": null
}
},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999,
{"entity_type": "US_DRIVER_LICENSE", "start": 22, "end": 30, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}, "score": 0.6499999999999999,
"analysis_explanation": {
"recognizer": "UsLicenseRecognizer", "pattern_name": "Driver License - Alphanumeric (weak)",
"pattern": "\\\\b([A-Z][0-9]{3,6}|[A-Z][0-9]{5,9}|[A-Z][0-9]{6,8}|[A-Z][0-9]{4,8}|[A-Z][0-9]{9,11}|[A-Z]{1,2}[0-9]{5,6}|H[0-9]{8}|V[0-9]{6}|X[0-9]{8}|A-Z]{2}[0-9]{2,5}|[A-Z]{2}[0-9]{3,7}|[0-9]{2}[A-Z]{3}[0-9]{5,6}|[A-Z][0-9]{13,14}|[A-Z][0-9]{18}|[A-Z][0-9]{6}R|[A-Z][0-9]{9}|[A-Z][0-9]{1,12}|[0-9]{9}[A-Z]|[A-Z]{2}[0-9]{6}[A-Z]|[0-9]{8}[A-Z]{2}|[0-9]{3}[A-Z]{2}[0-9]{4}|[A-Z][0-9][A-Z][0-9][A-Z]|[0-9]{7,8}[A-Z])\\\\b",
"original_score": 0.3, "score": 0.6499999999999999, "textual_explanation": null,
"score_context_improvement": 0.3499999999999999, "supportive_context_word": "driver", "validation_result": null
"score_context_improvement": 0.3499999999999999, "supportive_context_word": "license", "validation_result": null
}
}
]
Expand All @@ -203,6 +203,63 @@ def test_given_decision_process_enabled_for_analyze_input_then_return_response_w
assert equal_json_strings(expected_response, response_content)


@pytest.mark.api
def test_given_decision_process_enabled_for_analyze_input_with_aditional_context_then_return_response_with_decision_process_and_correct_supportive_context_word():
    """Check the decision process when the request supplies extra context words.

    The request sends ``"context": ["Driver license"]`` together with
    ``return_decision_process`` enabled. The expected payload reports
    ``"driver"`` as the supportive context word of the US_DRIVER_LICENSE
    result, alongside the recognizer metadata of both results.
    """
    # The JSON literals are kept flush-left so their content stays unchanged.
    expected = """
[
{
"analysis_explanation": {
"original_score": 0.85,
"pattern": null,
"pattern_name": null,
"recognizer": "SpacyRecognizer",
"score": 0.85,
"score_context_improvement": 0,
"supportive_context_word": "",
"textual_explanation": "Identified as PERSON by Spacy\'s Named Entity Recognition",
"validation_result": null
},
"end": 15,
"entity_type": "PERSON",
"recognition_metadata": {
"recognizer_name": "SpacyRecognizer"
},
"score": 0.85,
"start": 0
},
{
"analysis_explanation": {
"original_score": 0.3,
"pattern": "\\\\b([A-Z][0-9]{3,6}|[A-Z][0-9]{5,9}|[A-Z][0-9]{6,8}|[A-Z][0-9]{4,8}|[A-Z][0-9]{9,11}|[A-Z]{1,2}[0-9]{5,6}|H[0-9]{8}|V[0-9]{6}|X[0-9]{8}|A-Z]{2}[0-9]{2,5}|[A-Z]{2}[0-9]{3,7}|[0-9]{2}[A-Z]{3}[0-9]{5,6}|[A-Z][0-9]{13,14}|[A-Z][0-9]{18}|[A-Z][0-9]{6}R|[A-Z][0-9]{9}|[A-Z][0-9]{1,12}|[0-9]{9}[A-Z]|[A-Z]{2}[0-9]{6}[A-Z]|[0-9]{8}[A-Z]{2}|[0-9]{3}[A-Z]{2}[0-9]{4}|[A-Z][0-9][A-Z][0-9][A-Z]|[0-9]{7,8}[A-Z])\\\\b",
"pattern_name": "Driver License - Alphanumeric (weak)",
"recognizer": "UsLicenseRecognizer",
"score": 0.6499999999999999,
"score_context_improvement": 0.3499999999999999,
"supportive_context_word": "driver",
"textual_explanation": null,
"validation_result": null
},
"end": 27,
"entity_type": "US_DRIVER_LICENSE",
"recognition_metadata": {
"recognizer_name": "UsLicenseRecognizer"
},
"score": 0.6499999999999999,
"start": 19
}
]
"""
    payload = """
{
"text": "John Smith D.L. is AC432223", "language": "en", "return_decision_process": true, "context": ["Driver license"]
}
"""

    status_code, content = analyze(payload)

    assert status_code == 200
    assert equal_json_strings(expected, content)


def test_given_analyze_entities_input_then_return_results_only_with_those_entities():
request_body = """
{
Expand All @@ -214,7 +271,7 @@ def test_given_analyze_entities_input_then_return_results_only_with_those_entiti

expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85,
{"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85,
"analysis_explanation": null
}
]
Expand Down Expand Up @@ -300,9 +357,9 @@ def test_given_ad_hoc_pattern_recognizer_the_right_entities_are_returned():

expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation":null},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation":null},
{"entity_type": "ZIP", "start": 50, "end": 55, "score": 0.01, "analysis_explanation":null}
{"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85, "analysis_explanation":null},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}, "score": 0.6499999999999999, "analysis_explanation":null},
{"entity_type": "ZIP", "start": 50, "end": 55, "recognition_metadata": {"recognizer_name": "Zip code Recognizer"}, "score": 0.01, "analysis_explanation":null}
]
"""
assert response_status == 200
Expand Down Expand Up @@ -369,9 +426,9 @@ def test_given_ad_hoc_pattern_recognizer_context_raises_confidence():

expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation":null},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999, "analysis_explanation":null},
{"entity_type": "ZIP", "start": 50, "end": 55, "score": 0.4, "analysis_explanation":null}
{"entity_type": "PERSON", "start": 0, "end": 10, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85, "analysis_explanation":null},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}, "score": 0.6499999999999999, "analysis_explanation":null},
{"entity_type": "ZIP", "start": 50, "end": 55, "recognition_metadata": {"recognizer_name": "Zip code Recognizer"}, "score": 0.4, "analysis_explanation":null}
]
"""
assert response_status == 200
Expand Down Expand Up @@ -405,9 +462,9 @@ def test_given_ad_hoc_deny_list_recognizer_the_right_entities_are_returned():

expected_response = """
[
{"entity_type": "PERSON", "start": 4, "end": 14, "score": 0.85, "analysis_explanation":null},
{"entity_type": "US_DRIVER_LICENSE", "start": 36, "end": 44, "score": 0.6499999999999999, "analysis_explanation":null},
{"entity_type": "MR_TITLE", "start": 0, "end": 3, "score": 1.0, "analysis_explanation":null}
{"entity_type": "PERSON", "start": 4, "end": 14, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}, "score": 0.85, "analysis_explanation":null},
{"entity_type": "US_DRIVER_LICENSE", "start": 36, "end": 44, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}, "score": 0.6499999999999999, "analysis_explanation":null},
{"entity_type": "MR_TITLE", "start": 0, "end": 3, "recognition_metadata": {"recognizer_name": "Mr. Recognizer"}, "score": 1.0, "analysis_explanation":null}
]
"""
assert response_status == 200
Expand Down
12 changes: 6 additions & 6 deletions e2e-tests/tests/test_e2e_integration_flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ def test_given_text_with_pii_then_analyze_and_anonymize_successfully():
expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85,
"analysis_explanation": null
"analysis_explanation": null, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}
},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999,
"analysis_explanation": null
"analysis_explanation": null, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}
}
]
"""
Expand Down Expand Up @@ -79,7 +79,7 @@ def test_given_a_correct_analyze_input_high_threashold_then_anonymize_partially(

expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation": null}
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation": null, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}}
]
"""

Expand Down Expand Up @@ -125,7 +125,7 @@ def test_given_a_correct_analyze_input_with_high_threshold_and_unmatched_entitie

expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation": null}
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85, "analysis_explanation": null, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}}
]
"""

Expand Down Expand Up @@ -165,10 +165,10 @@ def test_given_an_unknown_entity_then_anonymize_uses_defaults():
expected_response = """
[
{"entity_type": "PERSON", "start": 0, "end": 10, "score": 0.85,
"analysis_explanation": null
"analysis_explanation": null, "recognition_metadata": {"recognizer_name": "SpacyRecognizer"}
},
{"entity_type": "US_DRIVER_LICENSE", "start": 30, "end": 38, "score": 0.6499999999999999,
"analysis_explanation": null
"analysis_explanation": null, "recognition_metadata": {"recognizer_name": "UsLicenseRecognizer"}
}
]
"""
Expand Down
1 change: 1 addition & 0 deletions presidio-analyzer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def analyze() -> Tuple[str, int]:
entities=req_data.entities,
return_decision_process=req_data.return_decision_process,
ad_hoc_recognizers=req_data.ad_hoc_recognizers,
context=req_data.context,
)

return Response(
Expand Down
4 changes: 4 additions & 0 deletions presidio-analyzer/presidio_analyzer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.analyzer_engine import AnalyzerEngine
from presidio_analyzer.analyzer_request import AnalyzerRequest
from presidio_analyzer.context_aware_enhancers import ContextAwareEnhancer
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer

# Define default loggers behavior

Expand Down Expand Up @@ -40,4 +42,6 @@
"RecognizerRegistry",
"AnalyzerEngine",
"AnalyzerRequest",
"ContextAwareEnhancer",
"LemmaContextAwareEnhancer",
]
30 changes: 30 additions & 0 deletions presidio-analyzer/presidio_analyzer/analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
)
from presidio_analyzer.app_tracer import AppTracer
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
from presidio_analyzer.context_aware_enhancers import (
ContextAwareEnhancer,
LemmaContextAwareEnhancer,
)

logger = logging.getLogger("presidio-analyzer")

Expand All @@ -30,6 +34,9 @@ class AnalyzerEngine:
for detected entities to be returned
:param supported_languages: List of possible languages this engine could be run on.
Used for loading the right NLP models and recognizers for these languages.
:param context_aware_enhancer: instance of type ContextAwareEnhancer for enhancing
confidence score based on context words (a LemmaContextAwareEnhancer will be created
by default if None is passed)
"""

def __init__(
Expand All @@ -40,6 +47,7 @@ def __init__(
log_decision_process: bool = False,
default_score_threshold: float = 0,
supported_languages: List[str] = None,
context_aware_enhancer: Optional[ContextAwareEnhancer] = None,
):
if not supported_languages:
supported_languages = ["en"]
Expand Down Expand Up @@ -70,6 +78,15 @@ def __init__(
self.log_decision_process = log_decision_process
self.default_score_threshold = default_score_threshold

if not context_aware_enhancer:
logger.debug(
"context aware enhancer not provided, creating default"
+ " lemma based enhancer."
)
context_aware_enhancer = LemmaContextAwareEnhancer()

self.context_aware_enhancer = context_aware_enhancer

def get_recognizers(self, language: Optional[str] = None) -> List[EntityRecognizer]:
"""
Return a list of PII recognizers currently loaded.
Expand Down Expand Up @@ -114,6 +131,7 @@ def analyze(
score_threshold: Optional[float] = None,
return_decision_process: Optional[bool] = False,
ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
context: Optional[List[str]] = None,
) -> List[RecognizerResult]:
"""
Find PII entities in text using different PII recognizers for a given language.
Expand All @@ -129,6 +147,8 @@ def analyze(
returned in the response.
:param ad_hoc_recognizers: List of recognizers which will be used only
for this specific request.
:param context: List of context words used to enhance the confidence score if they
match the context words of the recognized entity's recognizer
:return: an array of the found entities in the text
:example:
Expand Down Expand Up @@ -187,6 +207,16 @@ def analyze(
json.dumps([str(result.to_dict()) for result in results]),
)

# Update results in case surrounding words or external context are relevant to
# the context words.
results = self.context_aware_enhancer.enhance_using_context(
text=text,
raw_results=results,
nlp_artifacts=nlp_artifacts,
recognizers=recognizers,
context=context,
)

# Remove duplicates or low score results
results = EntityRecognizer.remove_duplicates(results)
results = self.__remove_low_scores(results, score_threshold)
Expand Down
1 change: 1 addition & 0 deletions presidio-analyzer/presidio_analyzer/analyzer_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ def __init__(self, req_data: Dict):
self.ad_hoc_recognizers = [
PatternRecognizer.from_dict(rec) for rec in ad_hoc_recognizers
]
self.context = req_data.get("context")
Loading

0 comments on commit a0c071b

Please sign in to comment.