From bf7b5db9781a0a370809fd41b95ebd7d6cd251bf Mon Sep 17 00:00:00 2001 From: Nikita Volodin Date: Sun, 28 Jan 2024 15:51:28 -0500 Subject: [PATCH] feat(presidio): improve structured analysis in batchanalyze --- presidio/Pipfile.lock | 2 +- presidio/local-testing.http | 78 +++++++++++++++++++++++++++++++++++++ presidio/server/__main__.py | 59 ++++++---------------------- presidio/server/helpers.py | 74 +++++++++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+), 49 deletions(-) create mode 100644 presidio/server/helpers.py diff --git a/presidio/Pipfile.lock b/presidio/Pipfile.lock index ae978282..c9b530af 100644 --- a/presidio/Pipfile.lock +++ b/presidio/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "8ef82890811447f8b70a311682754da920bad815e8075c7b3c2164c26319d10e" + "sha256": "95a30d8057a653a628e2bb360973aeb877be5ce06453308291326ab24dcf0b7d" }, "pipfile-spec": 6, "requires": {}, diff --git a/presidio/local-testing.http b/presidio/local-testing.http index 371c7fbf..ab499bb9 100644 --- a/presidio/local-testing.http +++ b/presidio/local-testing.http @@ -2,6 +2,84 @@ POST http://localhost:3000/batchanalyze Content-Type: application/json Accept: application/json +// expected result +//[ +// "PHONE_NUMBER", +// "URL", +// "EMAIL_ADDRESS", +// "LOCATION", +// "PERSON" +//] + +{ + "json_to_analyze": { + "key_a": { + "key_a1": "My phone number is 212-121-1424" + }, + "key_b": [ + "www.abc.com" + ], + "key_c": 3, + "names": [ + "James Bond", + "Clark Kent", + "Hakeem Olajuwon", + "No name here!" 
+ ], + "users": [ + { + "id": 1, + "name": "John Doe", + "email": "john.doe@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "postal_code": "12345" + } + }, + { + "id": 2, + "name": "Jane Smith", + "email": "jane.smith@example.com", + "address": { + "street": "456 Elm St", + "city": "Somewhere", + "state": "TX", + "postal_code": "67890" + } + }, + { + "id": 3, + "name": "Alice Johnson", + "email": "alice.johnson@example.com", + "address": { + "street": "789 Pine St", + "city": "Elsewhere", + "state": "NY", + "postal_code": "11223" + } + } + ] + } +} + +### + +POST http://localhost:3000/batchanalyze +Content-Type: application/json +Accept: application/json + +// expected output: +//[ +// "PHONE_NUMBER", +// "URL", +// "PERSON", +// "LOCATION", +// "DATE_TIME", +// "NRP" +//] + { "json_to_analyze": { "key_F": { diff --git a/presidio/server/__main__.py b/presidio/server/__main__.py index 3599ec61..0893a72e 100644 --- a/presidio/server/__main__.py +++ b/presidio/server/__main__.py @@ -2,19 +2,18 @@ import logging import os -import pprint -import re from logging.config import fileConfig from pathlib import Path -from typing import Tuple, List, Dict, Any, AnyStr +from typing import Tuple from flask import Flask, request, jsonify, Response -from werkzeug.exceptions import HTTPException - from presidio_analyzer.analyzer_engine import AnalyzerEngine -from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine from presidio_analyzer.analyzer_request import AnalyzerRequest +from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine from presidio_anonymizer import BatchAnonymizerEngine +from werkzeug.exceptions import HTTPException + +from helpers import convert_all_lists_to_dicts, extract_data_types_from_results DEFAULT_PORT = "3000" @@ -139,18 +138,17 @@ def batch_analyze() -> Tuple[Response, int]: # Note that this function implementation already adds the key as additional 'context' # for the decision 
(see batch_analyzer_engine.py line 96) recognizer_result_list = self.batch_analyzer.analyze_dict( - input_dict=request_obj["json_to_analyze"], language="en" + input_dict=convert_all_lists_to_dicts( + request_obj["json_to_analyze"] + ), + language="en", ) print(recognizer_result_list) - anonymizer_results = self.batch_anonymizer.anonymize_dict( + unique_pii_list = extract_data_types_from_results( recognizer_result_list ) - pprint.pprint(anonymizer_results) - class_substring_pattern = re.compile(r"<([^>]*)>") - unique_pii_list = recursive_find_pattern( - anonymizer_results, class_substring_pattern - ) + unique_valid_pii_list = [ pii for pii in unique_pii_list if pii in data_items_set ] @@ -172,41 +170,6 @@ def batch_analyze() -> Tuple[Response, int]: return jsonify(error=e.args[0]), 500 -def recursive_find_pattern( - d: Dict[AnyStr, Any], pattern: re.Pattern[AnyStr] -) -> List[AnyStr]: - def match_string(input_string: str): - pattern_found = pattern.search(input_string) - if pattern_found is None: - return - - first_match = pattern_found.group( - 1 - ) # group(1) gets matching data type within <> - if first_match is not None and first_match not in acc: - acc.append(first_match) - - def recursive_switch_case(v): - if isinstance(v, dict): - dict_find_pattern_in_value(v) - elif isinstance(v, list): - list_find_pattern_in_value(v) - elif isinstance(v, str): - match_string(v) - - def list_find_pattern_in_value(input_list: list): - for v in input_list: - recursive_switch_case(v) - - def dict_find_pattern_in_value(input_dict: dict): - for v in input_dict.values(): - recursive_switch_case(v) - - acc = [] - dict_find_pattern_in_value(d) - return acc - - data_items_set = [ "CREDIT_CARD", "NRP", diff --git a/presidio/server/helpers.py b/presidio/server/helpers.py new file mode 100644 index 00000000..49bf08a0 --- /dev/null +++ b/presidio/server/helpers.py @@ -0,0 +1,74 @@ +from typing import Any, Iterator, Set + +from presidio_analyzer import DictAnalyzerResult, RecognizerResult 
def convert_all_lists_to_dicts(data: Any) -> Any:
    """
    Recursively transform all lists into dictionaries.

    Every list found anywhere in ``data`` is replaced by a dictionary whose
    keys are the stringified element indexes (``"0"``, ``"1"``, ...).
    Dictionaries keep their keys and have their values converted recursively.
    Any other value is returned unchanged. New containers are built; the
    incoming structure is not modified.

    :param data: Incoming data
    :return: Transformed data
    """

    if isinstance(data, dict):
        return {k: convert_all_lists_to_dicts(v) for k, v in data.items()}

    if isinstance(data, list):
        # need to convert `int` indexes into strings, otherwise the
        # for-loop in the subsequent operation fails.
        return {str(i): convert_all_lists_to_dicts(v) for i, v in enumerate(data)}

    # Neither a list nor a dict: a leaf value, passed through as-is.
    return data


def extract_data_types_from_results(
    dict_results: "Iterator[DictAnalyzerResult]",
) -> Set[str]:
    """
    Extract `entity_type` fields from all nested `RecognizerResult` types within
    the incoming tree structure of `DictAnalyzerResult`.

    :param dict_results: Result of running `batch_analyzer.analyze_dict`
        function on JSON object
    :return: Set of entity types
    :raises TypeError: If a `recognizer_results` value is neither a list nor an
        iterator, or if a list holds anything other than `RecognizerResult`
        objects or lists of `RecognizerResult` objects
    """

    # Work stack of results still to inspect; nested iterators found along the
    # way are unpacked onto it, which avoids recursion.
    pending = list(dict_results)
    entity_types = set()

    while pending:
        # The type of `recognizer_results` is defined here:
        # https://github.com/microsoft/presidio/blob/3c7eb8909a3341f2597453fbcaba6184477aa464/presidio-analyzer/presidio_analyzer/dict_analyzer_result.py#L25
        recognizer_results = pending.pop().recognizer_results

        if isinstance(recognizer_results, list):
            # In this case `recognizer_results` has type
            # `Union[List[RecognizerResult], List[List[RecognizerResult]]]`.
            # Treat a bare item as a one-element group so that both shapes
            # share a single validation/extraction path.
            for item in recognizer_results:
                group = item if isinstance(item, list) else [item]
                for result in group:
                    if not isinstance(result, RecognizerResult):
                        raise TypeError(
                            "Unknown type of result: " + str(type(result))
                        )
                    entity_types.add(result.entity_type)

        elif isinstance(recognizer_results, Iterator):
            # Nested results: queue them for processing.
            pending.extend(recognizer_results)

        else:
            raise TypeError(
                "Unknown type of result: " + str(type(recognizer_results))
            )

    return entity_types