Skip to content

Commit

Permalink
Add a severity level for completeness alerts and add a column-level completeness score
Browse files Browse the repository at this point in the history
  • Loading branch information
armandleopold committed Nov 21, 2023
1 parent 00f011e commit 053ce71
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 4 deletions.
42 changes: 39 additions & 3 deletions profiling_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Main file for pack
"""
import os
import re
import glob
import json
import sys
Expand All @@ -11,7 +12,25 @@

warnings.filterwarnings("ignore", category=DeprecationWarning)


# Function to extract percentage and determine level
def determine_level(content):
    """
    Extract a percentage from an alert message and map it to a severity level.

    Parameters:
        content (str): Free-form alert text, e.g. "zipcode has 94.2% missing values".

    Returns:
        str: 'info' for <= 70%, 'warning' for (70%, 90%], 'high' for > 90%,
        and 'info' when no percentage is present in the text.
    """
    # Find the first percentage value in the string (integer or decimal).
    match = re.search(r'(\d+(\.\d+)?)%', content)
    if match:
        percentage = float(match.group(1))
        # Use continuous thresholds so fractional values such as 70.5 or 90.5
        # cannot fall through the gaps left by integer ranges (71 <= x <= 90).
        if percentage <= 70:
            return 'info'
        elif percentage <= 90:
            return 'warning'
        else:
            return 'high'
    return 'info'  # Default level if no percentage is found

# Denormalize a dictionary with nested dictionaries
def denormalize(data):
"""
Denormalize a dictionary with nested dictionaries
Expand Down Expand Up @@ -61,6 +80,21 @@ def denormalize(data):

############################ Metrics

# Per-column completeness: the fraction of non-null cells in each column,
# emitted as one metric entry per column with a column-scoped perimeter.
row_count = len(df)
completeness_scores = [
    {
        "key": "completeness_score",
        "value": str(df[column].notnull().sum() / row_count),
        "scope": {"perimeter": "column", "value": column},
    }
    for column in df.columns
]

# Materialize the per-column scores as a DataFrame so they can be
# concatenated with the other metric DataFrames later on.
completeness_scores_df = pd.DataFrame(completeness_scores)

# Load the JSON file
print("Load report.json")
with open("report.json", "r", encoding="utf-8") as file:
Expand Down Expand Up @@ -117,13 +151,15 @@ def denormalize(data):

# The third parsed table of the report holds the profiling alerts.
alerts = tables[2]
# Label the two raw columns: the alert message and its type.
alerts.columns = ["content", "type"]
alerts["type"] = "info"
# Set the scope perimeter as 'column'
# The first whitespace-separated token of each alert message is assumed to be
# the affected column name — TODO confirm this holds for every alert format.
alerts["scope"] = alerts["content"].str.split().str[0]

# Convert the scope to JSON: wrap the column name in a perimeter descriptor.
alerts["scope"] = alerts["scope"].apply(lambda x: {"perimeter": "column", "value": x})

# Apply the function to the 'content' column of the alerts DataFrame to
# derive a severity level from any percentage embedded in the alert text.
alerts['level'] = alerts['content'].apply(determine_level)

############################ Schemas

# Initialize the list with the dataset name entry
Expand All @@ -149,7 +185,7 @@ def denormalize(data):
variables_data_df = pd.DataFrame(variables_data)

# Concatenate all the DataFrames
metrics = pd.concat([general_data_df, variables_data_df, score], ignore_index=True)
metrics = pd.concat([general_data_df, variables_data_df, score, completeness_scores_df], ignore_index=True)

# Convert the DataFrames to JSON strings
metrics_json = metrics.to_json(orient="records")
Expand Down
2 changes: 1 addition & 1 deletion profiling_pack/properties.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ icon: icon.png
name: profiling
type: completeness
url: https://github.com/qalita-io/packs/tree/main/profiling_pack
version: 1.0.7
version: 1.0.8
visibility: public

0 comments on commit 053ce71

Please sign in to comment.