From d7c912749f39e8b10915ddcc3f8920b89d0fbd6e Mon Sep 17 00:00:00 2001 From: Armand LEOPOLD Date: Mon, 11 Mar 2024 16:11:39 +0100 Subject: [PATCH] Add Edge case compare data when no id columns & Outlier detection add outliers table --- data_compare_pack/main.py | 9 +++++--- data_compare_pack/pack_conf.json | 2 +- data_compare_pack/properties.yaml | 2 +- outlier_detection_pack/main.py | 30 +++++++++++++++++++++++++- outlier_detection_pack/pack_conf.json | 5 +++++ outlier_detection_pack/properties.yaml | 2 +- 6 files changed, 43 insertions(+), 7 deletions(-) diff --git a/data_compare_pack/main.py b/data_compare_pack/main.py index efadc3e..0659cea 100644 --- a/data_compare_pack/main.py +++ b/data_compare_pack/main.py @@ -10,13 +10,13 @@ # Checking if the columns exist in the DataFrames -compare_col_list = pack.pack_config["job"]["compare_col_list"] -id_columns = pack.pack_config["job"]["id_columns"] +compare_col_list = pack.pack_config["job"].get("compare_col_list", []) +id_columns = pack.pack_config["job"].get("id_columns", []) abs_tol = pack.pack_config["job"].get("abs_tol", 0.0001) rel_tol = pack.pack_config["job"].get("rel_tol", 0) # Create an intersection of source and target columns if compare_col_list is empty -if not compare_col_list: +if compare_col_list == []: compare_col_list = list( set(pack.df_source.columns).intersection(set(pack.df_target.columns)) ) @@ -37,6 +37,9 @@ # Combine compare_col_list and id_columns while removing duplicates combined_columns_list = list(dict.fromkeys(compare_col_list + id_columns)) +if len(id_columns) == 0 : + id_columns = compare_col_list + # Creating subsets for source and target data with no repeated columns df_source_subset = pack.df_source[combined_columns_list] df_target_subset = pack.df_target[combined_columns_list] diff --git a/data_compare_pack/pack_conf.json b/data_compare_pack/pack_conf.json index cb91810..6637385 100644 --- a/data_compare_pack/pack_conf.json +++ b/data_compare_pack/pack_conf.json @@ -17,7 +17,7 @@ "justify": true }, { - "metric_key": "recommendation_levels", + "metric_key": "recommendation_levels_mismatches", "chart_type": "recommendation_level_indicator", "display_title": true }, diff --git a/data_compare_pack/properties.yaml b/data_compare_pack/properties.yaml index 7e3207a..fbe02f5 100644 --- a/data_compare_pack/properties.yaml +++ b/data_compare_pack/properties.yaml @@ -3,5 +3,5 @@ icon: icon.png name: data_compare type: consistency url: https://github.com/qalita-io/packs/tree/main/data_compare_pack -version: 2.0.1 +version: 2.0.12 visibility: public diff --git a/outlier_detection_pack/main.py b/outlier_detection_pack/main.py index dd583ff..bd3bf71 100644 --- a/outlier_detection_pack/main.py +++ b/outlier_detection_pack/main.py @@ -177,7 +177,6 @@ def determine_recommendation_level(proportion_outliers): } ) -pack.metrics.save() # Define a threshold for considering a data point as an outlier normality_threshold = pack.pack_config["job"][ @@ -278,6 +277,35 @@ def determine_recommendation_level(proportion_outliers): ) all_univariate_outliers = all_univariate_outliers[id_and_other_columns] + +# Extracting column labels +columnLabels = all_univariate_outliers.columns.tolist() + +# Converting the DataFrame into the desired format without row labels +data_formatted = [ + [{"value": row[col]} for col in all_univariate_outliers.columns] + for index, row in all_univariate_outliers.iterrows() +] + +# The formatted data structure, now without rowLabels +format_structure = { + "columnLabels": columnLabels, + "data": data_formatted, +} + +# Append the precision, recall, and F1 score to the metrics +pack.metrics.data.extend( + [ + { + "key": "outliers_table", + "value": format_structure, + "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, + }, + ] +) + +pack.metrics.save() + # Step 2: Compile Multivariate Outliers multivariate_outliers["index"] = ( multivariate_outliers.index diff --git a/outlier_detection_pack/pack_conf.json b/outlier_detection_pack/pack_conf.json index 06ecdae..edcea64 100644 --- a/outlier_detection_pack/pack_conf.json +++ b/outlier_detection_pack/pack_conf.json @@ -20,6 +20,11 @@ "chart_type": "text", "display_title": true, "justify": true + }, + { + "metric_key": "outliers_table", + "chart_type": "table", + "display_title": true } ], "scoped": [ diff --git a/outlier_detection_pack/properties.yaml b/outlier_detection_pack/properties.yaml index c018b2e..b93dbb1 100644 --- a/outlier_detection_pack/properties.yaml +++ b/outlier_detection_pack/properties.yaml @@ -3,5 +3,5 @@ icon: icon.png name: outlier_detection type: reasonability url: https://github.com/qalita-io/packs/tree/main/outlier_detection_pack -version: 2.0.2 +version: 2.0.5 visibility: public