Skip to content

Commit

Permalink
Merge pull request #23 from qalita-io/Improve-Compare-Data
Browse files Browse the repository at this point in the history
Improve compare data
  • Loading branch information
armandleopold authored Mar 17, 2024
2 parents 62ecbcf + 7ba3c51 commit d705b2d
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 50 deletions.
43 changes: 30 additions & 13 deletions data_compare_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@


# Checking if the columns exist in the DataFrames
compare_col_list = pack.pack_config["job"]["compare_col_list"]
id_columns = pack.pack_config["job"]["id_columns"]
compare_col_list = pack.pack_config["job"].get("compare_col_list", [])
id_columns = pack.pack_config["job"].get("id_columns", [])
abs_tol = pack.pack_config["job"].get("abs_tol", 0.0001)
rel_tol = pack.pack_config["job"].get("rel_tol", 0)

# Create an intersection of source and target columns if compare_col_list is empty
if not compare_col_list:
if compare_col_list == []:
compare_col_list = list(
set(pack.df_source.columns).intersection(set(pack.df_target.columns))
)
Expand All @@ -37,6 +37,9 @@
# Combine compare_col_list and id_columns while removing duplicates
combined_columns_list = list(dict.fromkeys(compare_col_list + id_columns))

if len(id_columns) == 0:
id_columns = compare_col_list

# Creating subsets for source and target data with no repeated columns
df_source_subset = pack.df_source[combined_columns_list]
df_target_subset = pack.df_target[combined_columns_list]
Expand Down Expand Up @@ -226,19 +229,38 @@
]
)


# Extracting column labels
columnLabels = df_all_mismatch.columns.tolist()

# Converting the DataFrame into the desired format without row labels
# Dictionary to map the old suffix to the new one
suffix_mapping = {"_df1": "_source", "_df2": "_target"}

# Replace a recognized suffix at the end of each column name, applying at most one replacement per column
new_columnLabels = [
(
col
if not any(col.endswith(suffix) for suffix in suffix_mapping.keys())
else next(
col.replace(suffix, replacement)
for suffix, replacement in suffix_mapping.items()
if col.endswith(suffix)
)
)
for col in columnLabels
]

# Rename the columns of df_all_mismatch using the new labels
df_all_mismatch.columns = new_columnLabels

# The conversion to the table format is unchanged; it now picks up the renamed columns
data_formatted = [
[{"value": row[col]} for col in df_all_mismatch.columns]
for index, row in df_all_mismatch.iterrows()
]

# The formatted data structure, now without rowLabels
# The formatted data structure, now with renamed labels
format_structure = {
"columnLabels": columnLabels,
"columnLabels": new_columnLabels, # Use the new column labels
"data": data_formatted,
}

Expand All @@ -247,12 +269,7 @@
[
{
"key": "recommendation_levels_mismatches",
"value": {"info": "<=0.5", "warning": ">0.5", "high": ">0.8"},
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
},
{
"key": "check_column",
"value": [combined_columns_list],
"value": {"info": "0", "warning": "0.5", "high": "0.8"},
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
},
{
Expand Down
97 changes: 63 additions & 34 deletions data_compare_pack/pack_conf.json
Original file line number Diff line number Diff line change
@@ -1,36 +1,65 @@
{
"job": {
"compare_col_list": [],
"id_columns": [],
"abs_tol": 0.0001,
"rel_tol": 0,
"source": {
"skiprows": 0
}
},
"charts": {
"overview": [
{
"metric_key": "score",
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "recommendation_levels",
"chart_type": "recommendation_level_indicator",
"display_title": true
},
{
"metric_key": "check_column",
"chart_type": "check_table",
"display_title": true
},
{
"metric_key": "mismatches_table",
"chart_type": "table",
"display_title": true
}
]
"job": {
"compare_col_list": [],
"id_columns": [],
"abs_tol": 0.0001,
"rel_tol": 0,
"source": {
"skiprows": 0
}
}
},
"charts": {
"overview": [
{
"metric_key": "score",
"chart_type": "text",
"tooltip": {
"title": "Comparison Score [Higher is better]",
"content": "Proportion of data in source that matches the target."
},
"display_title": true,
"justify": true
},
{
"metric_key": "precision",
"chart_type": "text",
"tooltip": {
"title": "Precision [Higher is better]",
"content": "The portion of rows in the target that are correctly represented in the source dataset"
},
"display_title": true,
"justify": true
},
{
"metric_key": "recall",
"chart_type": "text",
"tooltip": {
"title": "Recall [Higher is better]",
"content": "The portion of rows in the source that are correctly represented in the target dataset"
},
"display_title": true,
"justify": true
},
{
"metric_key": "score",
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "recommendation_levels_mismatches",
"chart_type": "recommendation_level_indicator",
"tooltip": {
"title": "Recommendation level's importance mapping",
"content": "Gives the recommendation level for proportions of mismatches"
},
"display_title": true
},
{
"metric_key": "mismatches_table",
"chart_type": "table",
"display_title": true
}
]
}
}
2 changes: 1 addition & 1 deletion data_compare_pack/properties.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ icon: icon.png
name: data_compare
type: consistency
url: https://github.com/qalita-io/packs/tree/main/data_compare_pack
version: 2.0.1
version: 2.0.21
visibility: public
30 changes: 29 additions & 1 deletion outlier_detection_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ def determine_recommendation_level(proportion_outliers):
}
)

pack.metrics.save()

# Define a threshold for considering a data point as an outlier
normality_threshold = pack.pack_config["job"][
Expand Down Expand Up @@ -278,6 +277,35 @@ def determine_recommendation_level(proportion_outliers):
)
all_univariate_outliers = all_univariate_outliers[id_and_other_columns]


# Extracting column labels
columnLabels = all_univariate_outliers.columns.tolist()

# Converting the DataFrame into the desired format without row labels
data_formatted = [
[{"value": row[col]} for col in all_univariate_outliers.columns]
for index, row in all_univariate_outliers.iterrows()
]

# The formatted data structure, now without rowLabels
format_structure = {
"columnLabels": columnLabels,
"data": data_formatted,
}

# Append the univariate outliers table to the metrics
pack.metrics.data.extend(
[
{
"key": "outliers_table",
"value": format_structure,
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
},
]
)

pack.metrics.save()

# Step 2: Compile Multivariate Outliers
multivariate_outliers["index"] = (
multivariate_outliers.index
Expand Down
5 changes: 5 additions & 0 deletions outlier_detection_pack/pack_conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "outliers_table",
"chart_type": "table",
"display_title": true
}
],
"scoped": [
Expand Down
2 changes: 1 addition & 1 deletion outlier_detection_pack/properties.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ icon: icon.png
name: outlier_detection
type: reasonability
url: https://github.com/qalita-io/packs/tree/main/outlier_detection_pack
version: 2.0.2
version: 2.0.5
visibility: public

0 comments on commit d705b2d

Please sign in to comment.