Skip to content

Commit

Permalink
metrics rounded to 2 decimals, add sqlalchemy + edge case
Browse files Browse the repository at this point in the history
  • Loading branch information
armandleopold committed Dec 11, 2023
1 parent 45bc0cf commit aa94d9a
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 46 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
**/__pycache__/
poetry.lock
poetry.lock
profiling_pack/source_conf.json
24 changes: 18 additions & 6 deletions profiling_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@

warnings.filterwarnings("ignore", category=DeprecationWarning)

def round_if_numeric(value, decimals=2):
    """Return *value* as a string, rounded when it is numeric.

    Args:
        value: Any object. Anything ``float()`` accepts (floats, numeric
            strings, ...) is rounded; everything else is stringified as-is.
        decimals: Number of decimal places used both for rounding and for
            formatting the result. Defaults to 2.

    Returns:
        str: ``str(value)`` for integers, booleans and non-numeric values;
        otherwise the rounded number, written without a fractional part when
        it is whole (``2.0`` -> ``"2"``) and with exactly ``decimals`` places
        when it is not (``2.5`` -> ``"2.50"``).
    """
    # Integers need no rounding; keeping them out of float() avoids precision
    # loss on large values and an OverflowError on very large ones. bool is a
    # subclass of int, so True/False stay "True"/"False" instead of "1"/"0".
    if isinstance(value, int):
        return str(value)

    try:
        rounded_value = round(float(value), decimals)
    except (ValueError, TypeError, OverflowError):
        # Not convertible to a float: return the original value as text.
        return str(value)

    # A whole number after rounding is shown without a fractional part.
    if rounded_value.is_integer():
        return str(int(rounded_value))

    # Format with the requested precision (previously hard-coded to 2).
    # max(..., 0) guards the format spec for nan/inf with negative decimals.
    return f"{rounded_value:.{max(decimals, 0)}f}"

# Function to extract percentage and determine level
def determine_level(content):
Expand Down Expand Up @@ -67,7 +79,7 @@ def denormalize(data):
for col in df.columns:
non_null_count = df[col].notnull().sum()
total_count = len(df)
completeness_score = non_null_count / total_count
completeness_score = round(non_null_count / total_count, 2)
completeness_scores.append(
{
"key": "completeness_score",
Expand All @@ -89,7 +101,7 @@ def denormalize(data):
for key, value in general_data.items():
entry = {
"key": key,
"value": str(value),
"value": round_if_numeric(value),
"scope": {"perimeter": "dataset", "value": config["name"]},
}
new_format_data.append(entry)
Expand All @@ -101,7 +113,7 @@ def denormalize(data):
for attr_name, attr_value in attributes.items():
entry = {
"key": attr_name,
"value": str(attr_value),
"value": round_if_numeric(attr_value),
"scope": {"perimeter": "column", "value": variable_name},
}
new_format_data.append(entry)
Expand All @@ -122,10 +134,10 @@ def denormalize(data):
score = pd.DataFrame(
{
"key": "score",
"value": str(
"value": str(round(
(int(number_of_observations) - int(missing_cells))
/ int(number_of_observations)
),
/ int(number_of_observations), 2
)),
"scope": {"perimeter": "dataset", "value": config["name"]},
},
index=[0],
Expand Down
7 changes: 5 additions & 2 deletions profiling_pack/opener.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# opener.py
"""
The opener module contains functions to load data from files and databases.
"""

import os
import glob
import pandas as pd
Expand Down Expand Up @@ -91,7 +94,7 @@ def load_data(config):
return load_data_file(first_data_file)
else:
raise FileNotFoundError(
f"The path {path} is neither a file nor a directory."
f"The path {path} is neither a file nor a directory. Or can't be reached."
)

elif source_type == "database":
Expand Down
87 changes: 51 additions & 36 deletions profiling_pack/pack_conf.json
Original file line number Diff line number Diff line change
@@ -1,38 +1,53 @@
{
"metrics_chart_mapping": [
{
"metric_key":"n_cells_missing",
"chart_type":"area_chart"
},
{
"metric_key":"n",
"chart_type":"area_chart"
},
{
"metric_key":"n_cells_missing",
"chart_type":"text_header"
},
{
"metric_key":"p_cells_missing",
"chart_type":"text_header"
}
],
"schemas_chart_mapping": [
{
"metric_key":"n_missing",
"chart_type":"area_chart"
},
{
"metric_key":"min",
"chart_type":"text_header"
},
{
"metric_key":"max",
"chart_type":"text_header"
},
{
"metric_key":"skewness",
"chart_type":"text_header"
}
]
"charts": {
"overview": [
{
"metric_key": "n_cells_missing",
"chart_type": "area_chart",
"display_title": true
},
{
"metric_key": "n",
"chart_type": "area_chart",
"display_title": true
},
{
"metric_key": "n_cells_missing",
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "p_cells_missing",
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "score",
"chart_type": "text",
"display_title": true,
"justify": true
}
],
"scoped": [
{
"metric_key": "type",
"chart_type": "badge",
"display_title": true,
"justify": true
},
{
"metric_key": "completeness_score",
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "completeness_score",
"chart_type": "spark_area_chart",
"display_title": false
}
]
}
}
2 changes: 1 addition & 1 deletion profiling_pack/properties.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ icon: icon.png
name: profiling
type: completeness
url: https://github.com/qalita-io/packs/tree/main/profiling_pack
version: 1.0.15
version: 1.0.21
visibility: public
1 change: 1 addition & 0 deletions profiling_pack/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ matplotlib = "3.7.0"
lxml = "^4.9.3"
pandas = "2.0.3"
openpyxl = "^3.1.2"
sqlalchemy = "^2.0.23"

[build-system]
requires = ["poetry-core"]
Expand Down

0 comments on commit aa94d9a

Please sign in to comment.