Merge pull request #22 from qalita-io/Improve-Compare-Data
add precision recall compute score, export mismatches into file
armandleopold authored Mar 5, 2024
2 parents 51a987d + eabf9bf commit 62ecbcf
Showing 4 changed files with 209 additions and 32 deletions.
19 changes: 10 additions & 9 deletions data_compare_pack/README.md
@@ -5,23 +5,24 @@ The Data Comparison Pack is a robust solution designed to compare and analyze da

It uses the [DataComPy](https://github.com/capitalone/datacompy) library to compare the data.

## Features
## Analysis 🕵️‍♂️

- **Configuration-Driven Approach**: Easy to set up through `source_conf.json`, `target_conf.json`, and `pack_conf.json` files, allowing for flexible and dynamic comparison criteria.
- **Data Loading**: Integrated data loading mechanism using `opener.py`, ensuring secure and reliable ingestion of source and target datasets.
- **Comprehensive Data Comparison**: Utilizes `datacompy` to perform an exhaustive comparison between source and target datasets, ensuring high accuracy in data analysis.
- **Insightful Reporting**: Generates a detailed report highlighting differences and similarities between datasets, including DataFrame summaries, Column summaries, Row summaries, and Column comparisons.
- **Metrics Generation**: Parses the generated report to extract key metrics, providing quantitative insights into the datasets' comparison.
- **Score Calculation**: Computes a matching score based on the rate of target rows that match with the source, offering a clear, percentage-based metric to gauge data consistency.
- **Resilient Error Handling**: Implements robust error handling, providing clear feedback and ensuring stability even in case of data discrepancies or configuration issues.
| Name | Description | Scope | Type |
| ------------ | -------------------- | ------- | ------- |
| `score` | Matching score between the source and target datasets | Dataset | `float` |
| `precision` | The proportion of rows in the target that are correctly represented in the source dataset | Dataset | `float` |
| `recall` | The proportion of rows in the source that are correctly represented in the target dataset | Dataset | `float` |
| `f1_score` | The harmonic mean of precision and recall | Dataset | `float` |
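
As a rough illustration of how these metrics relate, the sketch below mirrors the computation in `main.py` (the counts are made-up placeholders):

```python
# Illustrative sketch of the pack's scoring logic; counts are placeholders.
num_rows_in_common = 950   # rows that match between source and target
total_source_rows = 980    # len(df_source)
total_target_rows = 1000   # len(df_target)
num_mismatches = 50        # len(compare.all_mismatch(ignore_matching_cols=True))

score = 1.0 if num_mismatches == 0 else max(0, 1 - num_mismatches / total_target_rows)
precision = num_rows_in_common / total_target_rows if total_target_rows else 0
recall = num_rows_in_common / total_source_rows if total_source_rows else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0
```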

## Output Files

The pack generates the following files as output, offering a comprehensive overview of the comparison:

- `metrics.json`: Contains all the metrics extracted from the comparison, including the matching score, precision, recall, and F1 score (see the sketch below for the entry structure).
- `comparison_report.txt`: A human-readable DataComPy report detailing the differences and similarities between the datasets.
- `comparison_report.xlsx`: A spreadsheet listing the mismatched rows found between the datasets.
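
For reference, each entry in `metrics.json` follows the key / value / scope structure that `main.py` builds; a minimal, hypothetical example (values are made up):

```python
# Hypothetical metrics.json entry, mirroring the structure appended in main.py.
example_metric = {
    "key": "precision",
    "value": "0.95",
    "scope": {"perimeter": "dataset", "value": "my_source_dataset"},
}
```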

## Usage

This pack is designed to be user-friendly and can be easily integrated into your data analysis pipeline. Ensure the configuration files are set up correctly, and then execute the pack to perform the comparison and generate the metrics.
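
The comparison itself is driven by the `job` section of `pack_conf.json`. A minimal sketch of those settings, expressed here as an equivalent Python dict with placeholder column names:

```python
# Mirrors the "job" section of pack_conf.json; column names are placeholders.
job_config = {
    "compare_col_list": ["amount", "status"],  # columns to compare (empty list = use the source/target column intersection)
    "id_columns": ["order_id"],                # join keys passed to datacompy.Compare
    "abs_tol": 0.0001,                         # absolute tolerance for numeric comparisons
    "rel_tol": 0,                              # relative tolerance for numeric comparisons
    "source": {"skiprows": 0},
}
```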

# Contribute
198 changes: 178 additions & 20 deletions data_compare_pack/main.py
@@ -1,40 +1,91 @@
from qalita_core.pack import Pack
import re
import datacompy
from datetime import datetime
import os

pack = Pack()
pack.load_data("source")
pack.load_data("target")


# Checking if the columns exist in the DataFrames
required_columns = pack.pack_config["job"]["col_list"]
missing_in_source = [col for col in required_columns if col not in pack.df_source.columns]
missing_in_target = [col for col in required_columns if col not in pack.df_target.columns]
compare_col_list = pack.pack_config["job"]["compare_col_list"]
id_columns = pack.pack_config["job"]["id_columns"]
abs_tol = pack.pack_config["job"].get("abs_tol", 0.0001)
rel_tol = pack.pack_config["job"].get("rel_tol", 0)

# Create an intersection of source and target columns if compare_col_list is empty
if not compare_col_list:
    compare_col_list = list(
        set(pack.df_source.columns).intersection(set(pack.df_target.columns))
    )

# Include only the relevant columns and ensure they exist
missing_in_source = [
    col for col in compare_col_list if col not in pack.df_source.columns
]
missing_in_target = [
    col for col in compare_col_list if col not in pack.df_target.columns
]

if missing_in_source:
    raise ValueError(f"Columns missing in source: {missing_in_source}")
if missing_in_target:
    raise ValueError(f"Columns missing in target: {missing_in_target}")

if missing_in_source or missing_in_target:
    print("Comparison not performed due to missing columns.")
    raise
# Combine compare_col_list and id_columns while removing duplicates
combined_columns_list = list(dict.fromkeys(compare_col_list + id_columns))

# If no columns are missing, proceed with the comparison
# Creating subsets for source and target data with no repeated columns
df_source_subset = pack.df_source[combined_columns_list]
df_target_subset = pack.df_target[combined_columns_list]

############################ Comparison using datacompy
compare = datacompy.Compare(
    pack.df_source,
    pack.df_target,
    join_columns=required_columns,  # Columns to join on
    abs_tol=0,  # Absolute tolerance
    rel_tol=0,  # Relative tolerance
    df_source_subset,
    df_target_subset,
    join_columns=id_columns,  # ID COLUMN
    abs_tol=abs_tol,  # Absolute tolerance
    rel_tol=rel_tol,  # Relative tolerance
    df1_name=pack.source_config["name"],
    df2_name=pack.target_config["name"],
)

comparison_report = compare.report(sample_count=10, column_count=10)

# Exporting comparison metrics :

pack.metrics.data.append(
    {
        "key": "dataframe_summary_number_columns_" + pack.source_config["name"],
        "value": compare.df1.shape[1],
        "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
    }
)
pack.metrics.data.append(
    {
        "key": "dataframe_summary_number_columns_" + pack.target_config["name"],
        "value": compare.df2.shape[1],
        "scope": {"perimeter": "dataset", "value": pack.target_config["name"]},
    }
)
pack.metrics.data.append(
    {
        "key": "dataframe_summary_number_rows_" + pack.source_config["name"],
        "value": compare.df1.shape[0],
        "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
    }
)
pack.metrics.data.append(
    {
        "key": "dataframe_summary_number_rows_" + pack.target_config["name"],
        "value": compare.df2.shape[0],
        "scope": {"perimeter": "dataset", "value": pack.target_config["name"]},
    }
)


# Save the comparison report to a text file
with open("comparison_report.txt", "w") as f:
    f.write(comparison_report)
@@ -45,7 +96,6 @@

# Define patterns for the parts you want to extract
patterns = {
    "dataframe_summary": r"DataFrame Summary\s+-+\s+([\s\S]+?)\n\n",
    "column_summary": r"Column Summary\s+-+\s+([\s\S]+?)\n\n",
    "row_summary": r"Row Summary\s+-+\s+([\s\S]+?)\n\n",
    "column_comparison": r"Column Comparison\s+-+\s+([\s\S]+?)\n\n",
@@ -57,9 +107,7 @@
    if match:
        section_content = match.group(1)
        # Extract key-value pairs
        extracted_data[key] = dict(
            re.findall(r"([^\n:]+):\s*(\d+)", section_content)
        )
        extracted_data[key] = dict(re.findall(r"([^\n:]+):\s*(\d+)", section_content))

# Convert extracted data to metrics
for section, data in extracted_data.items():
@@ -107,13 +155,22 @@
"Number of rows in Source but not in Target"
]

df_all_mismatch = compare.all_mismatch(ignore_matching_cols=True)

# Ensure the denominator is not zero to avoid division by zero error
total_rows_in_target = num_rows_in_common + num_rows_in_target_not_in_source
print(f"Total rows in target: {total_rows_in_target}")
if total_rows_in_target == 0:
total_target_rows = len(pack.df_target)
print(f"Total rows in target: {total_target_rows}")
if total_target_rows == 0:
print("Cannot compute the score as the total number of rows in target is zero.")
else:
score = num_rows_in_common / total_rows_in_target
num_mismatches = len(df_all_mismatch)
if num_mismatches == 0:
# If there are no mismatches, the score is 1 (100% match)
score = 1.0
else:
# Calculate score as a ratio of matched rows to total rows in target dataframe
score = max(0, 1 - (num_mismatches / total_target_rows))

print(f"Matching score: {score}")

# Append the score to the metrics
@@ -125,4 +182,105 @@
    }
)

# Compute Precision and Recall with the available variables
if total_target_rows == 0:
    precision = 0  # Avoid division by zero; no rows to match in target makes precision undefined, considered as 0
else:
    precision = num_rows_in_common / total_target_rows

total_source_rows = len(pack.df_source)
if total_source_rows == 0:
    recall = 0  # Similarly, avoid division by zero; no rows in source makes recall undefined, considered as 0
else:
    recall = num_rows_in_common / total_source_rows

print(f"Precision: {precision}")
print(f"Recall: {recall}")

# Calculate the F1 score, which is the harmonic mean of precision and recall
if precision + recall == 0:
    f1_score = 0  # Avoid division by zero; if both precision and recall are 0, F1 is undefined, considered as 0
else:
    f1_score = 2 * (precision * recall) / (precision + recall)

print(f"F1 Score: {f1_score}")

# Append the precision, recall, and F1 score to the metrics
pack.metrics.data.extend(
    [
        {
            "key": "precision",
            "value": str(round(precision, 2)),
            "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
        },
        {
            "key": "recall",
            "value": str(round(recall, 2)),
            "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
        },
        {
            "key": "f1_score",
            "value": str(round(f1_score, 2)),
            "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
        },
    ]
)


# Extracting column labels
columnLabels = df_all_mismatch.columns.tolist()

# Converting the DataFrame into the desired format without row labels
data_formatted = [
    [{"value": row[col]} for col in df_all_mismatch.columns]
    for index, row in df_all_mismatch.iterrows()
]

# The formatted data structure, now without rowLabels
format_structure = {
    "columnLabels": columnLabels,
    "data": data_formatted,
}

# Append the recommendation levels, checked columns, and mismatches table to the metrics
pack.metrics.data.extend(
    [
        {
            "key": "recommendation_levels_mismatches",
            "value": {"info": "<=0.5", "warning": ">0.5", "high": ">0.8"},
            "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
        },
        {
            "key": "check_column",
            "value": [combined_columns_list],
            "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
        },
        {
            "key": "mismatches_table",
            "value": format_structure,
            "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
        },
    ]
)

pack.metrics.save()

######################## Export:
# Check if there are any mismatches

if df_all_mismatch.empty:
    print("No mismatches found. No report will be generated.")
else:
    if pack.source_config["type"] == "file":
        source_file_dir = os.path.dirname(pack.source_config["config"]["path"])
        current_date = datetime.now().strftime("%Y%m%d")
        report_file_path = os.path.join(
            source_file_dir,
            f'{current_date}_data_compare_report_{pack.source_config["name"]}.xlsx',
        )

        # Export mismatched rows to an Excel file
        df_all_mismatch.to_excel(
            report_file_path, index=False
        )  # Set index=False as 'original_index' is now a column
        print(f"Mismatched rows have been exported to {report_file_path}")
22 changes: 20 additions & 2 deletions data_compare_pack/pack_conf.json
@@ -1,6 +1,9 @@
{
  "job": {
    "col_list": [],
    "compare_col_list": [],
    "id_columns": [],
    "abs_tol": 0.0001,
    "rel_tol": 0,
    "source": {
      "skiprows": 0
    }
@@ -12,7 +15,22 @@
        "chart_type": "text",
        "display_title": true,
        "justify": true
      },
      {
        "metric_key": "recommendation_levels",
        "chart_type": "recommendation_level_indicator",
        "display_title": true
      },
      {
        "metric_key": "check_column",
        "chart_type": "check_table",
        "display_title": true
      },
      {
        "metric_key": "mismatches_table",
        "chart_type": "table",
        "display_title": true
      }
    ]
}
}
}
2 changes: 1 addition & 1 deletion data_compare_pack/properties.yaml
@@ -3,5 +3,5 @@ icon: icon.png
name: data_compare
type: consistency
url: https://github.com/qalita-io/packs/tree/main/data_compare_pack
version: 2.0.0
version: 2.0.1
visibility: public
