From eabf9bfab5abff6b4477baf72e40ee6aaec45676 Mon Sep 17 00:00:00 2001 From: Armand LEOPOLD Date: Sun, 3 Mar 2024 23:46:41 +0100 Subject: [PATCH] add precision recall compute score, export mismatches into file --- data_compare_pack/README.md | 19 +-- data_compare_pack/main.py | 198 +++++++++++++++++++++++++++--- data_compare_pack/pack_conf.json | 22 +++- data_compare_pack/properties.yaml | 2 +- 4 files changed, 209 insertions(+), 32 deletions(-) diff --git a/data_compare_pack/README.md b/data_compare_pack/README.md index 0e47df3..d3ef46e 100644 --- a/data_compare_pack/README.md +++ b/data_compare_pack/README.md @@ -5,23 +5,24 @@ The Data Comparison Pack is a robust solution designed to compare and analyze da It uses [DataComPy](https://github.com/capitalone/datacompy) library to compare the data. -## Features +## Analysis 🕵️‍♂️ -- **Configuration-Driven Approach**: Easy to set up through `source_conf.json`, `target_conf.json`, and `pack_conf.json` files, allowing for flexible and dynamic comparison criteria. -- **Data Loading**: Integrated data loading mechanism using `opener.py`, ensuring secure and reliable ingestion of source and target datasets. -- **Comprehensive Data Comparison**: Utilizes `datacompy` to perform an exhaustive comparison between source and target datasets, ensuring high accuracy in data analysis. -- **Insightful Reporting**: Generates a detailed report highlighting differences and similarities between datasets, including DataFrame summaries, Column summaries, Row summaries, and Column comparisons. -- **Metrics Generation**: Parses the generated report to extract key metrics, providing quantitative insights into the datasets' comparison. -- **Score Calculation**: Computes a matching score based on the rate of target rows that match with the source, offering a clear, percentage-based metric to gauge data consistency. 
-- **Resilient Error Handling**: Implements robust error handling, providing clear feedback and ensuring stability even in case of data discrepancies or configuration issues. +| Name | Description | Scope | Type | +| ------------ | -------------------- | ------- | ------- | +| `score` | Matching score (1 - mismatched rows / total rows in the target dataset) | Dataset | `float` | +| `precision` | the portion of rows in the target that are correctly represented in the source dataset | Dataset | `float` | +| `recall` | the portion of rows in the source that are correctly represented in the target dataset | Dataset | `float` | +| `f1_score` | the harmonic mean of precision and recall | Dataset | `float` | ## Output Files + The pack generates the following files as output, offering a comprehensive overview of the comparison: - `metrics.json`: Contains all the metrics extracted from the comparison, including the matching score and other key statistics. -- `comparison_report.txt`: A human-readable report detailing the differences and similarities between the datasets. +- `{YYYYMMDD}_data_compare_report_{source_name}.xlsx`: An Excel export of the mismatched rows, written to the source file's directory when mismatches are found and the source is a file. ## Usage + This pack is designed to be user-friendly and can be easily integrated into your data analysis pipeline. Ensure the configuration files are set up correctly, and then execute the pack to perform the comparison and generate the metrics. 
# Contribute diff --git a/data_compare_pack/main.py b/data_compare_pack/main.py index 3571edd..efadc3e 100644 --- a/data_compare_pack/main.py +++ b/data_compare_pack/main.py @@ -1,40 +1,91 @@ from qalita_core.pack import Pack import re import datacompy +from datetime import datetime +import os pack = Pack() pack.load_data("source") pack.load_data("target") + # Checking if the columns exist in the DataFrames -required_columns = pack.pack_config["job"]["col_list"] -missing_in_source = [col for col in required_columns if col not in pack.df_source.columns] -missing_in_target = [col for col in required_columns if col not in pack.df_target.columns] +compare_col_list = pack.pack_config["job"]["compare_col_list"] +id_columns = pack.pack_config["job"]["id_columns"] +abs_tol = pack.pack_config["job"].get("abs_tol", 0.0001) +rel_tol = pack.pack_config["job"].get("rel_tol", 0) + +# Create an intersection of source and target columns if compare_col_list is empty +if not compare_col_list: + compare_col_list = list( + set(pack.df_source.columns).intersection(set(pack.df_target.columns)) + ) + +# Include only the relevant columns and ensure they exist +missing_in_source = [ + col for col in compare_col_list if col not in pack.df_source.columns +] +missing_in_target = [ + col for col in compare_col_list if col not in pack.df_target.columns +] if missing_in_source: raise ValueError(f"Columns missing in source: {missing_in_source}") if missing_in_target: raise ValueError(f"Columns missing in target: {missing_in_target}") -if missing_in_source or missing_in_target: - print("Comparison not performed due to missing columns.") - raise +# Combine compare_col_list and id_columns while removing duplicates +combined_columns_list = list(dict.fromkeys(compare_col_list + id_columns)) -# If no columns are missing, proceed with the comparison +# Creating subsets for source and target data with no repeated columns +df_source_subset = pack.df_source[combined_columns_list] +df_target_subset = 
pack.df_target[combined_columns_list] ############################ Comparison using datacompy compare = datacompy.Compare( - pack.df_source, - pack.df_target, - join_columns=required_columns, # Columns to join on - abs_tol=0, # Absolute tolerance - rel_tol=0, # Relative tolerance + df_source_subset, + df_target_subset, + join_columns=id_columns, # ID COLUMN + abs_tol=abs_tol, # Absolute tolerance + rel_tol=rel_tol, # Relative tolerance df1_name=pack.source_config["name"], df2_name=pack.target_config["name"], ) comparison_report = compare.report(sample_count=10, column_count=10) +# Exporting comparison metrics : + +pack.metrics.data.append( + { + "key": "dataframe_summary_number_columns_" + pack.source_config["name"], + "value": compare.df1.shape[1], + "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, + } +) +pack.metrics.data.append( + { + "key": "dataframe_summary_number_columns_" + pack.target_config["name"], + "value": compare.df2.shape[1], + "scope": {"perimeter": "dataset", "value": pack.target_config["name"]}, + } +) +pack.metrics.data.append( + { + "key": "dataframe_summary_number_rows_" + pack.source_config["name"], + "value": compare.df1.shape[0], + "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, + } +) +pack.metrics.data.append( + { + "key": "dataframe_summary_number_rows_" + pack.target_config["name"], + "value": compare.df2.shape[0], + "scope": {"perimeter": "dataset", "value": pack.target_config["name"]}, + } +) + + # Optionally, save the report to an HTML file with open("comparison_report.txt", "w") as f: f.write(comparison_report) @@ -45,7 +96,6 @@ # Define patterns for the parts you want to extract patterns = { - "dataframe_summary": r"DataFrame Summary\s+-+\s+([\s\S]+?)\n\n", "column_summary": r"Column Summary\s+-+\s+([\s\S]+?)\n\n", "row_summary": r"Row Summary\s+-+\s+([\s\S]+?)\n\n", "column_comparison": r"Column Comparison\s+-+\s+([\s\S]+?)\n\n", @@ -57,9 +107,7 @@ if match: section_content = 
match.group(1) # Extract key-value pairs - extracted_data[key] = dict( - re.findall(r"([^\n:]+):\s*(\d+)", section_content) - ) + extracted_data[key] = dict(re.findall(r"([^\n:]+):\s*(\d+)", section_content)) # Convert extracted data to metrics for section, data in extracted_data.items(): @@ -107,13 +155,22 @@ "Number of rows in Source but not in Target" ] +df_all_mismatch = compare.all_mismatch(ignore_matching_cols=True) + # Ensure the denominator is not zero to avoid division by zero error -total_rows_in_target = num_rows_in_common + num_rows_in_target_not_in_source -print(f"Total rows in target: {total_rows_in_target}") -if total_rows_in_target == 0: +total_target_rows = len(pack.df_target) +print(f"Total rows in target: {total_target_rows}") +if total_target_rows == 0: print("Cannot compute the score as the total number of rows in target is zero.") else: - score = num_rows_in_common / total_rows_in_target + num_mismatches = len(df_all_mismatch) + if num_mismatches == 0: + # If there are no mismatches, the score is 1 (100% match) + score = 1.0 + else: + # Calculate score as a ratio of matched rows to total rows in target dataframe + score = max(0, 1 - (num_mismatches / total_target_rows)) + print(f"Matching score: {score}") # Append the score to the metrics @@ -125,4 +182,105 @@ } ) +# Compute Precision and Recall with the available variables +if total_target_rows == 0: + precision = 0 # Avoid division by zero; no rows to match in target makes precision undefined, considered as 0 +else: + precision = num_rows_in_common / total_target_rows + +total_source_rows = len(pack.df_source) +if total_source_rows == 0: + recall = 0 # Similarly, avoid division by zero; no rows in source makes recall undefined, considered as 0 +else: + recall = num_rows_in_common / total_source_rows + +print(f"Precision: {precision}") +print(f"Recall: {recall}") + +# Calculate the F1 score, which is the harmonic mean of precision and recall +if precision + recall == 0: + f1_score = 0 # Avoid 
division by zero; if both precision and recall are 0, F1 is undefined, considered as 0 +else: + f1_score = 2 * (precision * recall) / (precision + recall) + +print(f"F1 Score: {f1_score}") + +# Append the precision, recall, and F1 score to the metrics +pack.metrics.data.extend( + [ + { + "key": "precision", + "value": str(round(precision, 2)), + "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, + }, + { + "key": "recall", + "value": str(round(recall, 2)), + "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, + }, + { + "key": "f1_score", + "value": str(round(f1_score, 2)), + "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, + }, + ] +) + + +# Extracting column labels +columnLabels = df_all_mismatch.columns.tolist() + +# Converting the DataFrame into the desired format without row labels +data_formatted = [ + [{"value": row[col]} for col in df_all_mismatch.columns] + for index, row in df_all_mismatch.iterrows() +] + +# The formatted data structure, now without rowLabels +format_structure = { + "columnLabels": columnLabels, + "data": data_formatted, +} + +# Append the recommendation levels, checked columns, and mismatches table to the metrics +pack.metrics.data.extend( + [ + { + "key": "recommendation_levels_mismatches", + "value": {"info": "<=0.5", "warning": ">0.5", "high": ">0.8"}, + "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, + }, + { + "key": "check_column", + "value": [combined_columns_list], + "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, + }, + { + "key": "mismatches_table", + "value": format_structure, + "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, + }, + ] +) + pack.metrics.save() + +######################## Export: +# Check if there are any mismatches + +if df_all_mismatch.empty: + print("No mismatches found. 
No report will be generated.") +else: + if pack.source_config["type"] == "file": + source_file_dir = os.path.dirname(pack.source_config["config"]["path"]) + current_date = datetime.now().strftime("%Y%m%d") + report_file_path = os.path.join( + source_file_dir, + f'{current_date}_data_compare_report_{pack.source_config["name"]}.xlsx', + ) + + # Export mismatches rows to an Excel file + df_all_mismatch.to_excel( + report_file_path, index=False + ) # Set index=False as 'original_index' is now a column + print(f"mismatches rows have been exported to {report_file_path}") diff --git a/data_compare_pack/pack_conf.json b/data_compare_pack/pack_conf.json index 54ff063..cb91810 100644 --- a/data_compare_pack/pack_conf.json +++ b/data_compare_pack/pack_conf.json @@ -1,6 +1,9 @@ { "job": { - "col_list": [], + "compare_col_list": [], + "id_columns": [], + "abs_tol": 0.0001, + "rel_tol": 0, "source": { "skiprows": 0 } @@ -12,7 +15,22 @@ "chart_type": "text", "display_title": true, "justify": true + }, + { + "metric_key": "recommendation_levels", + "chart_type": "recommendation_level_indicator", + "display_title": true + }, + { + "metric_key": "check_column", + "chart_type": "check_table", + "display_title": true + }, + { + "metric_key": "mismatches_table", + "chart_type": "table", + "display_title": true } ] } -} +} \ No newline at end of file diff --git a/data_compare_pack/properties.yaml b/data_compare_pack/properties.yaml index 3666314..7e3207a 100644 --- a/data_compare_pack/properties.yaml +++ b/data_compare_pack/properties.yaml @@ -3,5 +3,5 @@ icon: icon.png name: data_compare type: consistency url: https://github.com/qalita-io/packs/tree/main/data_compare_pack -version: 2.0.0 +version: 2.0.1 visibility: public