Skip to content

Commit

Permalink
feat(presidio): improve structured analysis in batchanalyze
Browse files Browse the repository at this point in the history
  • Loading branch information
qlonik committed Jan 28, 2024
1 parent 7c60ce3 commit bf7b5db
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 49 deletions.
2 changes: 1 addition & 1 deletion presidio/Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

78 changes: 78 additions & 0 deletions presidio/local-testing.http
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,84 @@ POST http://localhost:3000/batchanalyze
Content-Type: application/json
Accept: application/json

// expected result
//[
// "PHONE_NUMBER",
// "URL",
// "EMAIL_ADDRESS",
// "LOCATION",
// "PERSON"
//]

{
"json_to_analyze": {
"key_a": {
"key_a1": "My phone number is 212-121-1424"
},
"key_b": [
"www.abc.com"
],
"key_c": 3,
"names": [
"James Bond",
"Clark Kent",
"Hakeem Olajuwon",
"No name here!"
],
"users": [
{
"id": 1,
"name": "John Doe",
"email": "john.doe@example.com",
"address": {
"street": "123 Main St",
"city": "Anytown",
"state": "CA",
"postal_code": "12345"
}
},
{
"id": 2,
"name": "Jane Smith",
"email": "jane.smith@example.com",
"address": {
"street": "456 Elm St",
"city": "Somewhere",
"state": "TX",
"postal_code": "67890"
}
},
{
"id": 3,
"name": "Alice Johnson",
"email": "alice.johnson@example.com",
"address": {
"street": "789 Pine St",
"city": "Elsewhere",
"state": "NY",
"postal_code": "11223"
}
}
]
}
}

###

POST http://localhost:3000/batchanalyze
Content-Type: application/json
Accept: application/json

// expected output:
//[
// "PHONE_NUMBER",
// "URL",
// "PERSON",
// "LOCATION",
// "DATE_TIME",
// "NRP"
//]

{
"json_to_analyze": {
"key_F": {
Expand Down
59 changes: 11 additions & 48 deletions presidio/server/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@

import logging
import os
import pprint
import re
from logging.config import fileConfig
from pathlib import Path
from typing import Tuple, List, Dict, Any, AnyStr
from typing import Tuple

from flask import Flask, request, jsonify, Response
from werkzeug.exceptions import HTTPException

from presidio_analyzer.analyzer_engine import AnalyzerEngine
from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine
from presidio_analyzer.analyzer_request import AnalyzerRequest
from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine
from presidio_anonymizer import BatchAnonymizerEngine
from werkzeug.exceptions import HTTPException

from helpers import convert_all_lists_to_dicts, extract_data_types_from_results

DEFAULT_PORT = "3000"

Expand Down Expand Up @@ -139,18 +138,17 @@ def batch_analyze() -> Tuple[Response, int]:
# Note that this function implementation already adds the key as additional 'context'
# for the decision (see batch_analyzer_engine.py line 96)
recognizer_result_list = self.batch_analyzer.analyze_dict(
input_dict=request_obj["json_to_analyze"], language="en"
input_dict=convert_all_lists_to_dicts(
request_obj["json_to_analyze"]
),
language="en",
)
print(recognizer_result_list)

anonymizer_results = self.batch_anonymizer.anonymize_dict(
unique_pii_list = extract_data_types_from_results(
recognizer_result_list
)
pprint.pprint(anonymizer_results)
class_substring_pattern = re.compile(r"<([^>]*)>")
unique_pii_list = recursive_find_pattern(
anonymizer_results, class_substring_pattern
)

unique_valid_pii_list = [
pii for pii in unique_pii_list if pii in data_items_set
]
Expand All @@ -172,41 +170,6 @@ def batch_analyze() -> Tuple[Response, int]:
return jsonify(error=e.args[0]), 500


def recursive_find_pattern(
    d: Dict[AnyStr, Any], pattern: re.Pattern[AnyStr]
) -> List[AnyStr]:
    """
    Collect the unique values captured by group 1 of `pattern` in every
    string nested anywhere inside `d`.

    Dictionaries (values only) and lists are walked depth-first; values of
    any other non-string type are ignored. Every match inside a string is
    considered, so a string holding several placeholders contributes all
    of them, not just the first one.
    :param d: Nested structure of dicts, lists and strings to search
    :param pattern: Compiled pattern with at least one capturing group
    :return: Captured values in order of first appearance, without duplicates
    """

    acc = []

    def visit(value: Any) -> None:
        if isinstance(value, dict):
            for child in value.values():
                visit(child)
        elif isinstance(value, list):
            for child in value:
                visit(child)
        elif isinstance(value, str):
            # `finditer` instead of `search`: one string may contain several
            # matches and each of them has to be reported.
            for match in pattern.finditer(value):
                captured = match.group(1)
                # group(1) is None when the group did not take part in
                # the match (possible for patterns with optional groups).
                if captured is not None and captured not in acc:
                    acc.append(captured)

    visit(d)
    return acc


data_items_set = [
"CREDIT_CARD",
"NRP",
Expand Down
74 changes: 74 additions & 0 deletions presidio/server/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Any, Iterator, Set

from presidio_analyzer import DictAnalyzerResult, RecognizerResult


def convert_all_lists_to_dicts(data: Any) -> Any:
    """
    Recursively transforms all lists into dictionaries.

    Every list becomes a dictionary keyed by the stringified position of
    each element (`"0"`, `"1"`, ...). Dictionaries keep their keys and have
    their values transformed. Any other value is returned unchanged. The
    input is never mutated; new containers are built.
    :param data: Incoming data
    :return: Transformed data
    """

    if isinstance(data, dict):
        return {k: convert_all_lists_to_dicts(v) for k, v in data.items()}

    if isinstance(data, list):
        # Indexes are converted to strings on purpose: the original author
        # notes that `int` keys make a for-loop in the subsequent
        # operation fail.
        return {
            str(index): convert_all_lists_to_dicts(item)
            for index, item in enumerate(data)
        }

    # Scalars and any other non-container values pass through untouched.
    return data


def extract_data_types_from_results(
    dict_results: Iterator[DictAnalyzerResult],
) -> Set[str]:
    """
    Extract `entity_type` fields from all nested `RecognizerResult` types within
    the incoming tree structure of `DictAnalyzerResult`.

    The tree is walked iteratively with an explicit work list, so deep
    nesting does not consume Python call stack.
    :param dict_results: Result of running `batch_analyzer.analyze_dict`
        function on JSON object
    :return: Set of entity types
    :raises TypeError: If a node or a leaf has a type other than the ones
        handled below
    """

    def entity_type_of(result: Any) -> str:
        # Single place where a leaf is validated and read.
        if not isinstance(result, RecognizerResult):
            raise TypeError("Unknown type of result: " + str(type(result)))
        return result.entity_type

    pending = list(dict_results)
    entity_types = set()

    while pending:
        # The type of `recognizer_results` is defined here:
        # https://github.com/microsoft/presidio/blob/3c7eb8909a3341f2597453fbcaba6184477aa464/presidio-analyzer/presidio_analyzer/dict_analyzer_result.py#L25
        recognizer_results = pending.pop().recognizer_results

        if isinstance(recognizer_results, list):
            # In this case `recognizer_results` has type
            # `Union[List[RecognizerResult], List[List[RecognizerResult]]]`,
            # so each item is either a leaf or a list of leaves.
            for item in recognizer_results:
                if isinstance(item, list):
                    entity_types.update(entity_type_of(rr) for rr in item)
                else:
                    entity_types.add(entity_type_of(item))

        elif isinstance(recognizer_results, Iterator):
            # An iterator holds further `DictAnalyzerResult` nodes;
            # queue them for processing.
            pending.extend(recognizer_results)

        else:
            raise TypeError(
                "Unknown type of result: " + str(type(recognizer_results))
            )

    return entity_types

0 comments on commit bf7b5db

Please sign in to comment.