diff --git a/.gitignore b/.gitignore index 7c83eb5..d2bd1a8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ **/__pycache__/ -poetry.lock \ No newline at end of file +poetry.lock +profiling_pack/source_conf.json diff --git a/profiling_pack/main.py b/profiling_pack/main.py index 00a25f4..ac68b55 100644 --- a/profiling_pack/main.py +++ b/profiling_pack/main.py @@ -10,6 +10,18 @@ warnings.filterwarnings("ignore", category=DeprecationWarning) +def round_if_numeric(value, decimals=2): + try: + # Convert to a float and round + rounded_value = round(float(value), decimals) + # If the rounded value is an integer, convert it to an int + if rounded_value.is_integer(): + return str(int(rounded_value)) + # Otherwise, format it as a string with two decimal places + return "{:.2f}".format(rounded_value) + except (ValueError, TypeError): + # Return the original value if it's not a number + return str(value) # Function to extract percentage and determine level def determine_level(content): @@ -67,7 +79,7 @@ def denormalize(data): for col in df.columns: non_null_count = df[col].notnull().sum() total_count = len(df) - completeness_score = non_null_count / total_count + completeness_score = round(non_null_count / total_count, 2) completeness_scores.append( { "key": "completeness_score", @@ -89,7 +101,7 @@ def denormalize(data): for key, value in general_data.items(): entry = { "key": key, - "value": str(value), + "value": round_if_numeric(value), "scope": {"perimeter": "dataset", "value": config["name"]}, } new_format_data.append(entry) @@ -101,7 +113,7 @@ def denormalize(data): for attr_name, attr_value in attributes.items(): entry = { "key": attr_name, - "value": str(attr_value), + "value": round_if_numeric(attr_value), "scope": {"perimeter": "column", "value": variable_name}, } new_format_data.append(entry) @@ -122,10 +134,10 @@ def denormalize(data): score = pd.DataFrame( { "key": "score", - "value": str( + "value": str(round( (int(number_of_observations) - int(missing_cells)) - / int(number_of_observations) - ), + / int(number_of_observations), 2 + )), "scope": {"perimeter": "dataset", "value": config["name"]}, }, index=[0], diff --git a/profiling_pack/opener.py b/profiling_pack/opener.py index 068a450..9ca7dce 100644 --- a/profiling_pack/opener.py +++ b/profiling_pack/opener.py @@ -1,4 +1,7 @@ -# opener.py +""" +The opener module contains functions to load data from files and databases. +""" + import os import glob import pandas as pd @@ -91,7 +94,7 @@ def load_data(config): return load_data_file(first_data_file) else: raise FileNotFoundError( - f"The path {path} is neither a file nor a directory." + f"The path {path} is neither a file nor a directory. Or can't be reached." ) elif source_type == "database": diff --git a/profiling_pack/pack_conf.json b/profiling_pack/pack_conf.json index dc27328..2f117e1 100644 --- a/profiling_pack/pack_conf.json +++ b/profiling_pack/pack_conf.json @@ -1,38 +1,53 @@ { - "metrics_chart_mapping": [ - { - "metric_key":"n_cells_missing", - "chart_type":"area_chart" - }, - { - "metric_key":"n", - "chart_type":"area_chart" - }, - { - "metric_key":"n_cells_missing", - "chart_type":"text_header" - }, - { - "metric_key":"p_cells_missing", - "chart_type":"text_header" - } - ], - "schemas_chart_mapping": [ - { - "metric_key":"n_missing", - "chart_type":"area_chart" - }, - { - "metric_key":"min", - "chart_type":"text_header" - }, - { - "metric_key":"max", - "chart_type":"text_header" - }, - { - "metric_key":"skewness", - "chart_type":"text_header" - } - ] + "charts": { + "overview": [ + { + "metric_key": "n_cells_missing", + "chart_type": "area_chart", + "display_title": true + }, + { + "metric_key": "n", + "chart_type": "area_chart", + "display_title": true + }, + { + "metric_key": "n_cells_missing", + "chart_type": "text", + "display_title": true, + "justify": true + }, + { + "metric_key": "p_cells_missing", + "chart_type": "text", + "display_title": true, + "justify": true + }, + { + "metric_key": "score", + "chart_type": "text", + "display_title": true, + "justify": true + } + ], + "scoped": [ + { + "metric_key": "type", + "chart_type": "badge", + "display_title": true, + "justify": true + }, + { + "metric_key": "completeness_score", + "chart_type": "text", + "display_title": true, + "justify": true + }, + { + "metric_key": "completeness_score", + "chart_type": "spark_area_chart", + "display_title": false + } + ] + } } diff --git a/profiling_pack/properties.yaml b/profiling_pack/properties.yaml index 0833597..0d6ceea 100644 --- a/profiling_pack/properties.yaml +++ b/profiling_pack/properties.yaml @@ -3,5 +3,5 @@ icon: icon.png name: profiling type: completeness url: https://github.com/qalita-io/packs/tree/main/profiling_pack -version: 1.0.15 +version: 1.0.21 visibility: public diff --git a/profiling_pack/pyproject.toml b/profiling_pack/pyproject.toml index ef21d14..3bf0549 100644 --- a/profiling_pack/pyproject.toml +++ b/profiling_pack/pyproject.toml @@ -13,6 +13,7 @@ matplotlib = "3.7.0" lxml = "^4.9.3" pandas = "2.0.3" openpyxl = "^3.1.2" +sqlalchemy = "^2.0.23" [build-system] requires = ["poetry-core"]