Merge pull request #13 from qalita-io/dev

Dev
armandleopold authored Feb 19, 2024
2 parents b104211 + 3bc6aac commit d0de3d2
Showing 29 changed files with 615 additions and 91 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -25,6 +25,8 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: '3.x'
- name: Update pip
run: pip install --upgrade pip
- name: Install safety
run: pip install poetry safety
- name: Run safety check
2 changes: 1 addition & 1 deletion LICENSE
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright [2024] [QALITA SAS]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
69 changes: 33 additions & 36 deletions accuracy_pack/README.md
@@ -1,42 +1,39 @@
# Accuracy

## Overview
This pack assesses the precision of float columns within a dataset, providing a granular view of data quality. The script computes the maximum number of decimal places for each float column and generates a normalized score representing the precision level of the data. The results are saved in `metrics.json`, with each float column's precision score detailed individually.

## Features
This pack assesses the precision of float columns within a dataset, providing a granular view of data quality. The script computes the maximum number of decimal places for each float column and generates a normalized score representing the precision level of the data.

## Input 📥

### Configuration ⚙️

| Name | Type | Required | Default | Description |
| ---------------------- | ------ | -------- | ------- | -------------------------------------------------------- |
| `jobs.source.skiprows` | `int` | no | `0` | The number of rows to skip at the beginning of the file. |
| `jobs.id_columns` | `list` | no | `[]` | The list of columns to use as identifier. |

### Source type compatibility 🧩

This pack is compatible with **files** 📁 (``csv``, ``xlsx``).

## Analysis 🕵️‍♂️

- **Precision Calculation**: Computes the maximum number of decimal places for each float value in float columns.
- **Score Normalization**: Normalizes the precision values to a 0-1 scale, providing a standardized precision score for each column.
- **Metrics Generation**: Outputs a `metrics.json` file containing precision scores for each float column, enhancing the interpretability of data quality.
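The precision calculation and score normalization above can be sketched as follows (a minimal sketch assuming a pandas DataFrame; the pack's actual implementation lives in `main.py`):

```python
import pandas as pd

def decimal_precision(series: pd.Series):
    """Return (max_decimals, proportion_score) for one float column."""
    values = series.dropna()
    # Count decimal places from the string form of each value
    decimals = values.apply(lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0)
    max_decimals = int(decimals.max())
    most_common = int(decimals.mode()[0])
    # Share of values carrying the most common decimals count (0-1 scale)
    proportion_score = float((decimals == most_common).sum() / len(decimals))
    return max_decimals, proportion_score

df = pd.DataFrame({"price": [1.25, 2.5, 3.75, 4.125]})
print(decimal_precision(df["price"]))  # (3, 0.5)
```

Here two of the four values carry the most common decimals count (2), hence a proportion score of 0.5.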

## Setup
Before running the script, ensure that the following files are properly configured:
- `source_conf.json`: Configuration file for the source data.
- `pack_conf.json`: Configuration file for the pack.
- Data file: The data to be analyzed, loaded using `opener.py`.

## Usage
To use this pack, follow these steps:
1. Ensure all prerequisite files (`source_conf.json`, `pack_conf.json`, and the data file) are in place.
2. Run the script with the appropriate Python interpreter.
3. Review the generated `metrics.json` for precision metrics of the dataset.

## Output
- `metrics.json`: Contains precision scores for each float column in the dataset. The structure of the output is as follows:

```json
[
  {
    "key": "decimal_precision",
    "value": "<precision_score>",
    "scope": {
      "perimeter": "column",
      "value": "<column_name>"
    }
  },
  ...
]
```

# Contribute

[This pack is part of Qalita Open Source Assets (QOSA) and is open to contribution. You can help us improve this pack by forking it and submitting a pull request here.](https://github.com/qalita-io/packs)

| Name | Description | Scope | Type |
| ------------------- | ------------------------------------------------- | ------- | ------- |
| `score` | Accuracy score | Dataset | `float` |
| `decimal_precision` | Number of maximum decimals seen for this variable | Column | `int` |
| `proportion_score` | Proportion of values with maximum decimals | Column | `float` |

## Output 📤

### Report 📊

This pack doesn't generate any report file.

# Contribute 💡

[This pack is part of Qalita Open Source Assets (QOSA) and is open to contribution. You can help us improve this pack by forking it and submitting a pull request here.](https://github.com/qalita-io/packs) 👥🚀
105 changes: 80 additions & 25 deletions accuracy_pack/main.py
@@ -1,5 +1,7 @@
import json
import utils
import os
from datetime import datetime

########################### Loading Data

@@ -30,13 +32,16 @@ def compute_metrics(df):
# Compute precision score for each float column
precision_data = []
total_proportion_score = 0 # Initialize total proportion score
valid_columns_count = 0 # Count of columns that have at least one non-NaN value

for column in float_columns:
decimals_count = (
df[column]
.dropna()
.apply(lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0)
)
column_data = df[column].dropna()

# Skip the column if it only contains NaN values
if column_data.empty:
continue

decimals_count = column_data.apply(lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0)
max_decimals = decimals_count.max()
most_common_decimals_series = decimals_count.mode()

@@ -46,37 +51,32 @@ def compute_metrics(df):
most_common_decimals = 0
proportion_score = 0
else:
most_common_decimals = most_common_decimals_series[
0
] # Get the most common decimals count
proportion_score = decimals_count[
decimals_count == most_common_decimals
].count() / len(decimals_count)
most_common_decimals = most_common_decimals_series[0] # Get the most common decimals count
proportion_score = decimals_count[decimals_count == most_common_decimals].count() / len(decimals_count)

total_proportion_score += proportion_score # Add proportion score to the total
valid_columns_count += 1 # Increment valid columns count

precision_data.append(
{
"key": "decimal_precision",
"value": str(max_decimals), # Maximum number of decimals
"scope": {"perimeter": "column", "value": column},
}
)
if max_decimals > 0:
precision_data.append(
{
"key": "decimal_precision",
"value": str(max_decimals), # Maximum number of decimals
"scope": {"perimeter": "column", "value": column},
}
)

# Always include proportion_score in precision_data even if max_decimals is 0
precision_data.append(
{
"key": "proportion_score",
"value": str(
round(proportion_score, 2)
), # Proportion of values with the most common decimals count
"value": str(round(proportion_score, 2)), # Proportion of values with the most common decimals count
"scope": {"perimeter": "column", "value": column},
}
)

# Calculate the mean of proportion scores
mean_proportion_score = (
total_proportion_score / len(float_columns) if float_columns.any() else 0
)
mean_proportion_score = total_proportion_score / valid_columns_count if valid_columns_count > 0 else 0

# Add the mean proportion score to the precision data
precision_data.append(
@@ -102,12 +102,24 @@ def compute_metrics(df):
if proportion_score < 0.9:
recommendation = {
"content": f"Column '{column}' has {(1-proportion_score)*100:.2f}% of data that are not rounded to the same number of decimals.",
"type": "Duplicates",
"type": "Unevenly Rounded Data",
"scope": {"perimeter": "column", "value": column},
"level": utils.determine_recommendation_level(1 - proportion_score),
}
recommendations.append(recommendation)

# Recommendation for the dataset
if precision_metrics:
mean_proportion_score = float(precision_metrics[-1]["value"])
if mean_proportion_score < 0.9:
recommendation = {
"content": f"The dataset has {(1-mean_proportion_score)*100:.2f}% of data that are not rounded to the same number of decimals.",
"type": "Unevenly Rounded Data",
"scope": {"perimeter": "dataset", "value": source_config["name"]},
"level": utils.determine_recommendation_level(1 - mean_proportion_score),
}
recommendations.append(recommendation)

############################ Writing Metrics and Recommendations to Files

if precision_metrics is not None:
@@ -119,3 +131,46 @@ def compute_metrics(df):
with open("recommendations.json", "w", encoding="utf-8") as f:
json.dump(recommendations, f, indent=4)
print("recommendations.json file created successfully.")


# ######################## Export:
# # Step 1: Filter the DataFrame based on precision recommendations

# id_columns = pack_config.get('job', {}).get('id_columns', [])

# # For simplicity, let's assume that columns with a proportion score lower than 0.9 need attention
# columns_to_check = [item["scope"]["value"] for item in precision_metrics if item["key"] == "proportion_score" and float(item["value"]) < 0.9]

# # Filter the DataFrame for rows that don't meet the rounding criteria in the specified columns
# expected_precision = float(precision_metrics[1]["value"])
# rows_with_rounding_issues = df[df[columns_to_check].applymap(lambda x: isinstance(x, float) and (len(str(x).split(".")[1]) if '.' in str(x) else 0) != expected_precision)]

# # Check if there are rows with rounding issues
# if rows_with_rounding_issues.empty:
# print("No rounding issues found. No report will be generated.")
# else:
# # If there are rows with rounding issues, proceed with sorting and exporting
# rows_with_rounding_issues = rows_with_rounding_issues.sort_values(by=columns_to_check)

# # Step 3: Set index or create 'index' column for the Excel export
# if id_columns:
# # Ensure all id_columns are in the DataFrame columns
# valid_id_columns = [col for col in id_columns if col in rows_with_rounding_issues.columns]
# if not valid_id_columns:
# print("None of the specified 'id_columns' are in the DataFrame. Using default index.")
# rows_with_rounding_issues = rows_with_rounding_issues.reset_index(drop=True)
# else:
# rows_with_rounding_issues = rows_with_rounding_issues.set_index(valid_id_columns)
# else:
# # If 'id_columns' is not provided or is empty, create an 'index' column with the original DataFrame's index
# rows_with_rounding_issues = rows_with_rounding_issues.reset_index()

# # Continue with the export process
# if source_config['type'] == 'file':
# source_file_dir = os.path.dirname(source_config['config']['path'])
# current_date = datetime.now().strftime("%Y%m%d")
# report_file_path = os.path.join(source_file_dir, f'rounding_issues_report_{source_config["name"]}_{current_date}.xlsx')

# # Export rows with rounding issues to an Excel file
# rows_with_rounding_issues.to_excel(report_file_path, index=False) # Set index=False as 'original_index' is now a column
# print(f"Rows with rounding issues have been exported to {report_file_path}")
1 change: 1 addition & 0 deletions accuracy_pack/pack_conf.json
@@ -1,5 +1,6 @@
{
"job": {
"id_columns": [],
"source": {
"skiprows": 0
}
4 changes: 2 additions & 2 deletions accuracy_pack/properties.yaml
@@ -1,7 +1,7 @@
description: Compute accuracy metrics
description: Compute decimal accuracy metrics
icon: icon.png
name: accuracy
type: accuracy
url: https://github.com/qalita-io/packs/tree/main/accuracy_pack
version: 1.1.0
version: 1.1.13
visibility: public
2 changes: 1 addition & 1 deletion data_compare_pack/LICENSE
@@ -222,7 +222,7 @@ Dependency : [DataComPy License](https://github.com/capitalone/datacompy/blob/de
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright [2024] [QALITA SAS]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
7 changes: 0 additions & 7 deletions data_compare_pack/README.md
@@ -8,24 +8,17 @@ It uses [DataComPy](https://github.com/capitalone/datacompy) library to compare
## Features

- **Configuration-Driven Approach**: Easy to set up through `source_conf.json`, `target_conf.json`, and `pack_conf.json` files, allowing for flexible and dynamic comparison criteria.

- **Data Loading**: Integrated data loading mechanism using `opener.py`, ensuring secure and reliable ingestion of source and target datasets.

- **Comprehensive Data Comparison**: Utilizes `datacompy` to perform an exhaustive comparison between source and target datasets, ensuring high accuracy in data analysis.

- **Insightful Reporting**: Generates a detailed report highlighting differences and similarities between datasets, including DataFrame summaries, Column summaries, Row summaries, and Column comparisons.

- **Metrics Generation**: Parses the generated report to extract key metrics, providing quantitative insights into the datasets' comparison.

- **Score Calculation**: Computes a matching score based on the rate of target rows that match with the source, offering a clear, percentage-based metric to gauge data consistency.

- **Resilient Error Handling**: Implements robust error handling, providing clear feedback and ensuring stability even in case of data discrepancies or configuration issues.

## Output Files
The pack generates the following files as output, offering a comprehensive overview of the comparison:

- `metrics.json`: Contains all the metrics extracted from the comparison, including the matching score and other key statistics.

- `comparison_report.txt`: A human-readable report detailing the differences and similarities between the datasets.

## Usage
2 changes: 1 addition & 1 deletion data_compare_pack/properties.yaml
Original file line number Diff line number Diff line change
@@ -3,5 +3,5 @@ icon: icon.png
name: data_compare
type: consistency
url: https://github.com/qalita-io/packs/tree/main/data_compare_pack
version: 1.1.0
version: 1.1.1
visibility: public
4 changes: 1 addition & 3 deletions duplicates_finder_pack/README.md
@@ -1,7 +1,5 @@
# Duplicates Finder

![Duplicates Finder](https://raw.githubusercontent.com/qalita-io/packs/dev/duplicates_finder_pack/duplicates_finder_banner.png)

Duplicates Finder searches for duplicate data and computes metrics.

## Input 📥
@@ -33,7 +31,7 @@ This pack is compatible with **files** 📁 (``csv``, ``xlsx``) and **databases**

The report exports the duplicated data, adding the id column, grouping duplicates together, and sorting them.

Filename is `duplicates_report_{source_config["name"]}_{current_date}.xlsx`
Filename is `{current_date}_duplicates_finder_report_{source_config["name"]}.xlsx`
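The grouping described above can be sketched with pandas (the column names here are illustrative; the pack's real logic lives in `main.py`):

```python
import pandas as pd

df = pd.DataFrame({
    "id": [1, 2, 3, 4],
    "email": ["a@x.io", "b@x.io", "a@x.io", "c@x.io"],
})

# keep=False flags every member of a duplicate group, not only the repeats
duplicated_rows = df[df.duplicated(subset=["email"], keep=False)]

# Sort so that members of the same group sit next to each other
duplicated_rows = duplicated_rows.sort_values(by=["email"])
print(list(duplicated_rows["id"]))  # [1, 3]
```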

# Contribute 💡

Binary file modified duplicates_finder_pack/duplicates_finder_banner.png
2 changes: 1 addition & 1 deletion duplicates_finder_pack/main.py
@@ -150,7 +150,7 @@
if source_config['type'] == 'file':
source_file_dir = os.path.dirname(source_config['config']['path'])
current_date = datetime.now().strftime("%Y%m%d")
report_file_path = os.path.join(source_file_dir, f'duplicates_report_{source_config["name"]}_{current_date}.xlsx')
report_file_path = os.path.join(source_file_dir, f'{current_date}_duplicates_finder_report_{source_config["name"]}.xlsx')

# Export duplicated rows to an Excel file
duplicated_rows.to_excel(report_file_path, index=False) # Set index=False as 'original_index' is now a column
2 changes: 1 addition & 1 deletion duplicates_finder_pack/properties.yaml
@@ -3,5 +3,5 @@ icon: icon.png
name: duplicates_finder
type: uniqueness
url: https://github.com/qalita-io/packs/tree/main/duplicates_finder_pack
version: 1.1.16
version: 1.1.19
visibility: public
4 changes: 1 addition & 3 deletions outlier_detection_pack/README.md
@@ -2,8 +2,6 @@

The Outlier Detection pack focuses on identifying and quantifying outliers within datasets, providing insights into the normality of data distributions. It utilizes the K-Nearest Neighbors (KNN) algorithm from the [PyOD library](https://pyod.readthedocs.io/) for outlier detection in both univariate (column-wise) and multivariate (dataset-wise) contexts.

![Outlier Detection](https://pyod.readthedocs.io/en/latest/_images/ALL.png)

## Input 📥

### Configuration ⚙️
@@ -44,7 +42,7 @@ The pack generates a report containing the following insights:
* **Univariate Outlier Detection**: A summary of the normality score for each numeric column, indicating the proportion of inliers in each column.
* **Multivariate Outlier Detection**: A summary of the normality score for the entire dataset, indicating the proportion of inliers across the entire dataset.

Filename is `outliers_report_{source_config["name"]}_{current_date}.xlsx`
Filename is `{current_date}_outlier_detection_report_{source_config["name"]}.xlsx`

# Contribute 💡

6 changes: 3 additions & 3 deletions outlier_detection_pack/main.py
@@ -182,7 +182,7 @@ def determine_recommendation_level(proportion_outliers):
if item["value"] < normality_threshold:
column_name = item["scope"]["value"]
recommendation = {
"content": f"Column '{column_name}' has a normality score of {item['value']*100}%. Consider reviewing for outliers.",
"content": f"Column '{column_name}' has a normality score of {item['value']*100}%.",
"type": "Outliers",
"scope": {"perimeter": "column", "value": column_name},
"level": determine_recommendation_level(
@@ -201,7 +201,7 @@ def determine_recommendation_level(proportion_outliers):
and dataset_normality_score < normality_threshold
):
recommendation = {
"content": f"The dataset '{source_config['name']}' has a normality score of {dataset_normality_score*100}%. Consider reviewing for outliers.",
"content": f"The dataset '{source_config['name']}' has a normality score of {dataset_normality_score*100}%.",
"type": "Outliers",
"scope": {"perimeter": "dataset", "value": source_config["name"]},
"level": determine_recommendation_level(
@@ -274,7 +274,7 @@ def determine_recommendation_level(proportion_outliers):
if source_config['type'] == 'file':
source_file_dir = os.path.dirname(source_config["config"]["path"])
current_date = datetime.now().strftime("%Y%m%d")
excel_file_name = f"outliers_report_{source_config['name']}_{current_date}.xlsx"
excel_file_name = f"{current_date}_outlier_detection_report_{source_config['name']}.xlsx"
excel_file_path = os.path.join(source_file_dir, excel_file_name)

# Use this path in the ExcelWriter
2 changes: 1 addition & 1 deletion outlier_detection_pack/properties.yaml
@@ -3,5 +3,5 @@ icon: icon.png
name: outlier_detection
type: reasonability
url: https://github.com/qalita-io/packs/tree/main/outlier_detection_pack
version: 1.1.28
version: 1.1.30
visibility: public
0 comments on commit d0de3d2