From bf7b5db9781a0a370809fd41b95ebd7d6cd251bf Mon Sep 17 00:00:00 2001
From: Nikita Volodin <volodin.n@gmail.com>
Date: Sun, 28 Jan 2024 15:51:28 -0500
Subject: [PATCH] feat(presidio): improve structured analysis in batchanalyze

---
 presidio/Pipfile.lock       |  2 +-
 presidio/local-testing.http | 78 +++++++++++++++++++++++++++++++++++++
 presidio/server/__main__.py | 59 ++++++----------------------
 presidio/server/helpers.py  | 74 +++++++++++++++++++++++++++++++++++
 4 files changed, 164 insertions(+), 49 deletions(-)
 create mode 100644 presidio/server/helpers.py

diff --git a/presidio/Pipfile.lock b/presidio/Pipfile.lock
index ae978282..c9b530af 100644
--- a/presidio/Pipfile.lock
+++ b/presidio/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "8ef82890811447f8b70a311682754da920bad815e8075c7b3c2164c26319d10e"
+            "sha256": "95a30d8057a653a628e2bb360973aeb877be5ce06453308291326ab24dcf0b7d"
         },
         "pipfile-spec": 6,
         "requires": {},
diff --git a/presidio/local-testing.http b/presidio/local-testing.http
index 371c7fbf..ab499bb9 100644
--- a/presidio/local-testing.http
+++ b/presidio/local-testing.http
@@ -2,6 +2,84 @@ POST http://localhost:3000/batchanalyze
 Content-Type: application/json
 Accept: application/json
 
+// expected output:
+//[
+//  "PHONE_NUMBER",
+//  "URL",
+//  "EMAIL_ADDRESS",
+//  "LOCATION",
+//  "PERSON"
+//]
+
+{
+  "json_to_analyze": {
+    "key_a": {
+      "key_a1": "My phone number is 212-121-1424"
+    },
+    "key_b": [
+      "www.abc.com"
+    ],
+    "key_c": 3,
+    "names": [
+      "James Bond",
+      "Clark Kent",
+      "Hakeem Olajuwon",
+      "No name here!"
+    ],
+    "users": [
+      {
+        "id": 1,
+        "name": "John Doe",
+        "email": "john.doe@example.com",
+        "address": {
+          "street": "123 Main St",
+          "city": "Anytown",
+          "state": "CA",
+          "postal_code": "12345"
+        }
+      },
+      {
+        "id": 2,
+        "name": "Jane Smith",
+        "email": "jane.smith@example.com",
+        "address": {
+          "street": "456 Elm St",
+          "city": "Somewhere",
+          "state": "TX",
+          "postal_code": "67890"
+        }
+      },
+      {
+        "id": 3,
+        "name": "Alice Johnson",
+        "email": "alice.johnson@example.com",
+        "address": {
+          "street": "789 Pine St",
+          "city": "Elsewhere",
+          "state": "NY",
+          "postal_code": "11223"
+        }
+      }
+    ]
+  }
+}
+
+###
+
+POST http://localhost:3000/batchanalyze
+Content-Type: application/json
+Accept: application/json
+
+// expected output:
+//[
+//  "PHONE_NUMBER",
+//  "URL",
+//  "PERSON",
+//  "LOCATION",
+//  "DATE_TIME",
+//  "NRP"
+//]
+
 {
   "json_to_analyze": {
     "key_F": {
diff --git a/presidio/server/__main__.py b/presidio/server/__main__.py
index 3599ec61..0893a72e 100644
--- a/presidio/server/__main__.py
+++ b/presidio/server/__main__.py
@@ -2,19 +2,18 @@
 
 import logging
 import os
-import pprint
-import re
 from logging.config import fileConfig
 from pathlib import Path
-from typing import Tuple, List, Dict, Any, AnyStr
+from typing import Tuple
 
 from flask import Flask, request, jsonify, Response
-from werkzeug.exceptions import HTTPException
-
 from presidio_analyzer.analyzer_engine import AnalyzerEngine
-from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine
 from presidio_analyzer.analyzer_request import AnalyzerRequest
+from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine
 from presidio_anonymizer import BatchAnonymizerEngine
+from werkzeug.exceptions import HTTPException
+
+from helpers import convert_all_lists_to_dicts, extract_data_types_from_results
 
 DEFAULT_PORT = "3000"
 
@@ -139,18 +138,17 @@ def batch_analyze() -> Tuple[Response, int]:
                 # Note that this function implementation already adds the key as additional 'context'
                 # for the decision (see batch_analyzer_engine.py line 96)
                 recognizer_result_list = self.batch_analyzer.analyze_dict(
-                    input_dict=request_obj["json_to_analyze"], language="en"
+                    input_dict=convert_all_lists_to_dicts(
+                        request_obj["json_to_analyze"]
+                    ),
+                    language="en",
                 )
                 print(recognizer_result_list)
 
-                anonymizer_results = self.batch_anonymizer.anonymize_dict(
+                unique_pii_list = extract_data_types_from_results(
                     recognizer_result_list
                 )
-                pprint.pprint(anonymizer_results)
-                class_substring_pattern = re.compile(r"<([^>]*)>")
-                unique_pii_list = recursive_find_pattern(
-                    anonymizer_results, class_substring_pattern
-                )
+
                 unique_valid_pii_list = [
                     pii for pii in unique_pii_list if pii in data_items_set
                 ]
@@ -172,41 +170,6 @@ def batch_analyze() -> Tuple[Response, int]:
                 return jsonify(error=e.args[0]), 500
 
 
-def recursive_find_pattern(
-    d: Dict[AnyStr, Any], pattern: re.Pattern[AnyStr]
-) -> List[AnyStr]:
-    def match_string(input_string: str):
-        pattern_found = pattern.search(input_string)
-        if pattern_found is None:
-            return
-
-        first_match = pattern_found.group(
-            1
-        )  # group(1) gets matching data type within <>
-        if first_match is not None and first_match not in acc:
-            acc.append(first_match)
-
-    def recursive_switch_case(v):
-        if isinstance(v, dict):
-            dict_find_pattern_in_value(v)
-        elif isinstance(v, list):
-            list_find_pattern_in_value(v)
-        elif isinstance(v, str):
-            match_string(v)
-
-    def list_find_pattern_in_value(input_list: list):
-        for v in input_list:
-            recursive_switch_case(v)
-
-    def dict_find_pattern_in_value(input_dict: dict):
-        for v in input_dict.values():
-            recursive_switch_case(v)
-
-    acc = []
-    dict_find_pattern_in_value(d)
-    return acc
-
-
 data_items_set = [
     "CREDIT_CARD",
     "NRP",
diff --git a/presidio/server/helpers.py b/presidio/server/helpers.py
new file mode 100644
index 00000000..49bf08a0
--- /dev/null
+++ b/presidio/server/helpers.py
@@ -0,0 +1,74 @@
+from typing import Any, Iterator, Set
+
+from presidio_analyzer import DictAnalyzerResult, RecognizerResult
+
+
+def convert_all_lists_to_dicts(data: Any) -> Any:
+    """
+    Recursively replace every list in ``data`` with a dictionary whose keys
+    are the stringified positions of the list elements, e.g.
+    ``["a", ["b"]]`` becomes ``{"0": "a", "1": {"0": "b"}}``.
+
+    Dictionaries are rebuilt with the same keys and converted values; any
+    other value is returned as is. The input itself is never mutated.
+
+    :param data: Incoming data
+    :return: Transformed data
+    """
+
+    if isinstance(data, dict):
+        return {k: convert_all_lists_to_dicts(v) for k, v in data.items()}
+
+    if isinstance(data, list):
+        # need to convert `int` indexes into strings, otherwise the
+        # for-loop in the subsequent operation fails.
+        return {str(i): convert_all_lists_to_dicts(v) for i, v in enumerate(data)}
+
+    # Anything that is neither a dict nor a list (str, numbers, None, ...)
+    # is a leaf of the structure and is returned unchanged.
+    return data
+
+
+def extract_data_types_from_results(
+    dict_results: Iterator[DictAnalyzerResult],
+) -> Set[str]:
+    """
+    Collect the `entity_type` of every `RecognizerResult` reachable from the
+    incoming tree structure of `DictAnalyzerResult`.
+    :param dict_results: Result of running `batch_analyzer.analyze_dict`
+    function on JSON object
+    :return: Set of entity types
+    :raises TypeError: If a value of an unexpected type is encountered
+    """
+
+    entity_types = set()
+    # Explicit work stack instead of recursion; `pop()` takes the last entry.
+    pending = list(dict_results)
+
+    while pending:
+        # The type of `recognizer_results` is defined here:
+        # https://github.com/microsoft/presidio/blob/3c7eb8909a3341f2597453fbcaba6184477aa464/presidio-analyzer/presidio_analyzer/dict_analyzer_result.py#L25
+        node_results = pending.pop().recognizer_results
+
+        if not isinstance(node_results, list):
+            if not isinstance(node_results, Iterator):
+                raise TypeError("Unknown type of result: " + str(type(node_results)))
+            # An iterator is treated as nested results; its items are queued.
+            pending.extend(node_results)
+            continue
+
+        # From here on `node_results` is expected to have type
+        # `Union[List[RecognizerResult], List[List[RecognizerResult]]]`.
+        for entry in node_results:
+            if isinstance(entry, RecognizerResult):
+                entity_types.add(entry.entity_type)
+                continue
+            if not isinstance(entry, list):
+                raise TypeError("Unknown type of result: " + str(type(entry)))
+
+            for leaf in entry:
+                if not isinstance(leaf, RecognizerResult):
+                    raise TypeError("Unknown type of result: " + str(type(leaf)))
+                entity_types.add(leaf.entity_type)
+
+    return entity_types