Skip to content

Commit

Permalink
Add a severity level for completeness alerts and add a column-level completeness score
Browse files Browse the repository at this point in the history
  • Loading branch information
armandleopold committed Nov 21, 2023
1 parent 00f011e commit 053ce71
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 4 deletions.
42 changes: 39 additions & 3 deletions profiling_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Main file for pack
"""
import os
import re
import glob
import json
import sys
Expand All @@ -11,7 +12,25 @@

warnings.filterwarnings("ignore", category=DeprecationWarning)


# Function to extract percentage and determine level
def determine_level(content):
    """
    Extract a percentage from an alert message and map it to a severity level.

    Parameters:
        content (str): Free-form alert text, e.g. "zipcode has 94.2% missing values".

    Returns:
        str: 'info' for <= 70%, 'warning' for (70%, 90%], 'high' for > 90%,
        and 'info' when no percentage is present in the text.
    """
    # Find the first percentage value in the string (integer or decimal).
    match = re.search(r'(\d+(\.\d+)?)%', content)
    if match:
        percentage = float(match.group(1))
        # Use continuous thresholds so fractional values such as 70.5 or 90.5
        # cannot fall through the gaps left by integer ranges (71 <= x <= 90).
        if percentage <= 70:
            return 'info'
        elif percentage <= 90:
            return 'warning'
        else:
            return 'high'
    return 'info'  # Default level if no percentage is found

# Denormalize a dictionary with nested dictionaries
def denormalize(data):
"""
Denormalize a dictionary with nested dictionaries
Expand Down Expand Up @@ -61,6 +80,21 @@ def denormalize(data):

############################ Metrics

# Per-column completeness: the fraction of non-null cells in each column,
# emitted as one metric entry per column with a column-scoped perimeter.
row_count = len(df)
completeness_scores = [
    {
        "key": "completeness_score",
        "value": str(df[column].notnull().sum() / row_count),
        "scope": {"perimeter": "column", "value": column},
    }
    for column in df.columns
]

# Materialize the per-column scores as a DataFrame so they can be
# concatenated with the other metric DataFrames later on.
completeness_scores_df = pd.DataFrame(completeness_scores)

# Load the JSON file
print("Load report.json")
with open("report.json", "r", encoding="utf-8") as file:
Expand Down Expand Up @@ -117,13 +151,15 @@ def denormalize(data):

# The third parsed table of the report holds the profiling alerts.
alerts = tables[2]
# Label the two raw columns: the alert message and its type.
alerts.columns = ["content", "type"]
alerts["type"] = "info"
# Set the scope perimeter as 'column'
# The first whitespace-separated token of each alert message is assumed to be
# the affected column name — TODO confirm this holds for every alert format.
alerts["scope"] = alerts["content"].str.split().str[0]

# Convert the scope to JSON: wrap the column name in a perimeter descriptor.
alerts["scope"] = alerts["scope"].apply(lambda x: {"perimeter": "column", "value": x})

# Apply the function to the 'content' column of the alerts DataFrame to
# derive a severity level from any percentage embedded in the alert text.
alerts['level'] = alerts['content'].apply(determine_level)

############################ Schemas

# Initialize the list with the dataset name entry
Expand All @@ -149,7 +185,7 @@ def denormalize(data):
variables_data_df = pd.DataFrame(variables_data)

# Concatenate all the DataFrames
metrics = pd.concat([general_data_df, variables_data_df, score], ignore_index=True)
metrics = pd.concat([general_data_df, variables_data_df, score, completeness_scores_df], ignore_index=True)

# Convert the DataFrames to JSON strings
metrics_json = metrics.to_json(orient="records")
Expand Down
2 changes: 1 addition & 1 deletion profiling_pack/properties.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ icon: icon.png
name: profiling
type: completeness
url: https://github.com/qalita-io/packs/tree/main/profiling_pack
version: 1.0.7
version: 1.0.8
visibility: public

0 comments on commit 053ce71

Please sign in to comment.