Commit d6a2371
big update: switch to qalita_core utility lib
armandleopold committed Feb 22, 2024
1 parent 3bc6aac commit d6a2371
Showing 41 changed files with 568 additions and 1,997 deletions.
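
The churn in accuracy_pack/main.py below follows one pattern: the pack drops its hand-rolled loading of source_conf.json and pack_conf.json and its explicit json.dump() of metrics.json and recommendations.json in favor of the new qalita_core Pack object. A minimal sketch of the adopted pattern, inferred from the diff below (what Pack does internally is an assumption, not confirmed qalita_core behavior):

# Pack-script pattern this commit adopts, inferred from the diff below.
# The behavior attributed to Pack internals is an assumption.
from qalita_core.pack import Pack

pack = Pack()             # presumably loads source_conf.json and pack_conf.json
pack.load_data("source")  # presumably runs the opener logic into pack.df_source

# Metric and recommendation dicts are appended to containers on the pack...
pack.metrics.data.append(
    {
        "key": "score",
        "value": "0.95",  # illustrative value
        "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
    }
)

# ...and persisted in one call each, replacing the explicit json.dump() blocks.
pack.metrics.save()
pack.recommendations.save()

The per-pack utils helper goes the same way: determine_recommendation_level is now imported from qalita_core.utils instead of a local utils module.
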
30 changes: 15 additions & 15 deletions .github/workflows/ci.yml
@@ -58,18 +58,18 @@ jobs:
       - name: Run bandit
         run: bandit -lll -r .
 
-  # pytest:
-  #   runs-on: ubuntu-latest
-  #   needs: [bandit_check, secrets_scan, safety_check]
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #     - name: Set up Python
-  #       uses: actions/setup-python@v4
-  #       with:
-  #         python-version: '3.11'
-  #     - name: Install dependencies
-  #       run: |
-  #         pip install poetry pytest
-  #         poetry install --no-root
-  #     - name: Run pytest
-  #       run: poetry run pytest tests/
+  pytest:
+    runs-on: ubuntu-latest
+    needs: [bandit_check, secrets_scan, safety_check]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          pip install poetry pytest
+          poetry install --no-root
+      - name: Run pytest
+        run: poetry run pytest tests/
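
The previously commented-out pytest job is re-enabled verbatim, so CI now collects whatever lives under tests/ via poetry run pytest tests/. Purely as a hypothetical illustration (this page does not show the repository's actual test files), a test the job would pick up could look like:

# tests/test_precision.py -- hypothetical example only; the repository's
# real tests are not shown on this commit page.
import pandas as pd


def proportion_score(column: pd.Series) -> float:
    """Share of values whose decimal count equals the column's modal count."""
    decimals = column.dropna().apply(
        lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0
    )
    most_common = decimals.mode()[0]
    return (decimals == most_common).sum() / len(decimals)


def test_uniformly_rounded_column_scores_one():
    assert proportion_score(pd.Series([1.25, 2.75, 3.75])) == 1.0


def test_mixed_rounding_scores_below_one():
    # three of the four values share the modal decimal count of 2
    assert proportion_score(pd.Series([1.25, 2.25, 3.25, 4.5])) == 0.75

Locally, the job's steps reduce to pip install poetry pytest, poetry install --no-root, and poetry run pytest tests/.
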
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@ poetry.lock
 profiling_pack/source_conf.json
 docker-compose.yml
 docker-compose.yml
+/.pytest_cache/
9 changes: 9 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,9 @@
+{
+  "python.testing.pytestArgs": [
+    "tests"
+  ],
+  "python.testing.unittestEnabled": false,
+  "python.testing.pytestEnabled": true,
+  "python.analysis.typeCheckingMode": "basic",
+  "python.analysis.autoImportCompletions": false
+}
63 changes: 1 addition & 62 deletions accuracy_pack/LICENSE
@@ -28,65 +28,4 @@ BY INSTALLING, COPYING, OR OTHERWISE USING THE SOFTWARE, LICENSEE AGREES TO BE B
 
 BEFORE USING THIS SOFTWARE, CAREFULLY READ THIS LICENSE AGREEMENT. BY USING THE SOFTWARE, YOU ARE AGREEING TO BE BOUND BY THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE TO THE TERMS OF THIS LICENSE AGREEMENT, DO NOT USE THE SOFTWARE.
 
-This is a legal agreement and should be treated as such. If you have any questions regarding this agreement, please contact QALITA at **[email protected].**
-
-
-____
-
-Dependency : [SQLAlchemy License](https://github.com/sqlalchemy/sqlalchemy/blob/main/LICENSE)
-
-Copyright 2005-2024 SQLAlchemy authors and contributors <see AUTHORS file>.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-____
-
-Dependency : [Pandas License](https://github.com/pandas-dev/pandas/blob/main/LICENSE)
-
-BSD 3-Clause License
-
-Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
-All rights reserved.
-
-Copyright (c) 2011-2024, Open source contributors.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-* Neither the name of the copyright holder nor the names of its
-  contributors may be used to endorse or promote products derived from
-  this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+This is a legal agreement and should be treated as such. If you have any questions regarding this agreement, please contact QALITA at **[email protected].**
193 changes: 82 additions & 111 deletions accuracy_pack/main.py
@@ -1,148 +1,119 @@
-import json
-import utils
-import os
-from datetime import datetime
-
-########################### Loading Data
-
-# Load the configuration file
-print("Load source_conf.json")
-with open("source_conf.json", "r", encoding="utf-8") as file:
-    source_config = json.load(file)
-
-# Load the pack configuration file
-print("Load pack_conf.json")
-with open("pack_conf.json", "r", encoding="utf-8") as file:
-    pack_config = json.load(file)
-
-# Load data using the opener.py logic
-from opener import load_data
-
-df = load_data(source_config, pack_config)
-
-############################ Compute Precision Score for Each Float Column
-def compute_metrics(df):
-    float_columns = df.select_dtypes(include=["float", "float64"]).columns
-
-    # If there are no float columns, return None
-    if not float_columns.any():
-        print("No float columns found. metrics.json will not be created.")
-        return []
-
-    # Compute precision score for each float column
-    precision_data = []
-    total_proportion_score = 0  # Initialize total proportion score
-    valid_columns_count = 0  # Count of columns that have at least one non-NaN value
-
-    for column in float_columns:
-        column_data = df[column].dropna()
-
-        # Skip the column if it only contains NaN values
-        if column_data.empty:
-            continue
-
-        decimals_count = column_data.apply(lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0)
-        max_decimals = decimals_count.max()
-        most_common_decimals_series = decimals_count.mode()
-
-        # Handle the scenario when the mode() returns an empty series
-        if most_common_decimals_series.empty:
-            print(f"No common decimal count found for column {column}.")
-            most_common_decimals = 0
-            proportion_score = 0
-        else:
-            most_common_decimals = most_common_decimals_series[0]  # Get the most common decimals count
-            proportion_score = decimals_count[decimals_count == most_common_decimals].count() / len(decimals_count)
-
-        total_proportion_score += proportion_score  # Add proportion score to the total
-        valid_columns_count += 1  # Increment valid columns count
-
-        if max_decimals > 0:
-            precision_data.append(
-                {
-                    "key": "decimal_precision",
-                    "value": str(max_decimals),  # Maximum number of decimals
-                    "scope": {"perimeter": "column", "value": column},
-                }
-            )
-
-        # Always include proportion_score in precision_data even if max_decimals is 0
-        precision_data.append(
-            {
-                "key": "proportion_score",
-                "value": str(round(proportion_score, 2)),  # Proportion of values with the most common decimals count
-                "scope": {"perimeter": "column", "value": column},
-            }
-        )
-
-    # Calculate the mean of proportion scores
-    mean_proportion_score = total_proportion_score / valid_columns_count if valid_columns_count > 0 else 0
-
-    # Add the mean proportion score to the precision data
-    precision_data.append(
-        {
-            "key": "score",
-            "value": str(round(mean_proportion_score, 2)),  # Mean proportion score
-            "scope": {"perimeter": "dataset", "value": source_config["name"]},
-        }
-    )
-
-    return precision_data
-
-
-# Compute metrics
-precision_metrics = compute_metrics(df)
-
-################### Recommendations
-recommendations = []
-for column in df.columns:
-    for item in precision_metrics:
+from qalita_core.pack import Pack
+from qalita_core.utils import determine_recommendation_level
+
+pack = Pack()
+pack.load_data("source")
+
+float_columns = pack.df_source.select_dtypes(include=["float", "float64"]).columns
+
+# If there are no float columns, return None
+if not float_columns.any():
+    print("No float columns found. metrics.json will not be created.")
+    raise
+
+total_proportion_score = 0  # Initialize total proportion score
+valid_columns_count = 0  # Count of columns that have at least one non-NaN value
+
+for column in float_columns:
+    column_data = pack.df_source[column].dropna()
+
+    # Skip the column if it only contains NaN values
+    if column_data.empty:
+        continue
+
+    decimals_count = column_data.apply(
+        lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0
+    )
+    max_decimals = decimals_count.max()
+    most_common_decimals_series = decimals_count.mode()
+
+    # Handle the scenario when the mode() returns an empty series
+    if most_common_decimals_series.empty:
+        print(f"No common decimal count found for column {column}.")
+        most_common_decimals = 0
+        proportion_score = 0
+    else:
+        most_common_decimals = most_common_decimals_series[
+            0
+        ]  # Get the most common decimals count
+        proportion_score = decimals_count[
+            decimals_count == most_common_decimals
+        ].count() / len(decimals_count)
+
+    total_proportion_score += proportion_score  # Add proportion score to the total
+    valid_columns_count += 1  # Increment valid columns count
+
+    if max_decimals > 0:
+        pack.metrics.data.append(
+            {
+                "key": "decimal_precision",
+                "value": str(max_decimals),  # Maximum number of decimals
+                "scope": {"perimeter": "column", "value": column},
+            }
+        )
+
+    # Always include proportion_score in pack.metrics.data even if max_decimals is 0
+    pack.metrics.data.append(
+        {
+            "key": "proportion_score",
+            "value": str(
+                round(proportion_score, 2)
+            ),  # Proportion of values with the most common decimals count
+            "scope": {"perimeter": "column", "value": column},
+        }
+    )
+
+# Calculate the mean of proportion scores
+mean_proportion_score = (
+    total_proportion_score / valid_columns_count if valid_columns_count > 0 else 0
+)
+
+# Add the mean proportion score to the precision data
+pack.metrics.data.append(
+    {
+        "key": "score",
+        "value": str(round(mean_proportion_score, 2)),  # Mean proportion score
+        "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
+    }
+)
+
+for column in pack.df_source.columns:
+    for item in pack.metrics.data:
         if item["scope"]["value"] == column and item["key"] == "proportion_score":
             proportion_score = float(item["value"])
             if proportion_score < 0.9:
                 recommendation = {
                     "content": f"Column '{column}' has {(1-proportion_score)*100:.2f}% of data that are not rounded to the same number of decimals.",
                     "type": "Unevenly Rounded Data",
                     "scope": {"perimeter": "column", "value": column},
-                    "level": utils.determine_recommendation_level(1 - proportion_score),
+                    "level": determine_recommendation_level(1 - proportion_score),
                 }
-                recommendations.append(recommendation)
+                pack.recommendations.data.append(recommendation)
 
 # Recommendation for the dataset
-if precision_metrics:
-    mean_proportion_score = float(precision_metrics[-1]["value"])
+if pack.metrics.data:
+    mean_proportion_score = float(pack.metrics.data[-1]["value"])
     if mean_proportion_score < 0.9:
         recommendation = {
             "content": f"The dataset has {(1-mean_proportion_score)*100:.2f}% of data that are not rounded to the same number of decimals.",
             "type": "Unevenly Rounded Data",
-            "scope": {"perimeter": "dataset", "value": source_config["name"]},
-            "level": utils.determine_recommendation_level(1 - mean_proportion_score),
+            "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
+            "level": determine_recommendation_level(1 - mean_proportion_score),
         }
-        recommendations.append(recommendation)
-
-############################ Writing Metrics and Recommendations to Files
-
-if precision_metrics is not None:
-    with open("metrics.json", "w") as file:
-        json.dump(precision_metrics, file, indent=4)
-    print("metrics.json file created successfully.")
-
-if recommendations:
-    with open("recommendations.json", "w", encoding="utf-8") as f:
-        json.dump(recommendations, f, indent=4)
-    print("recommendations.json file created successfully.")
+        pack.recommendations.data.append(recommendation)
+
+pack.metrics.save()
+pack.recommendations.save()
 
 # ######################## Export:
 # # Step 1: Filter the DataFrame based on precision recommendations
 
 # id_columns = pack_config.get('job', {}).get('id_columns', [])
 
 # # For simplicity, let's assume that columns with a proportion score lower than 0.9 need attention
-# columns_to_check = [item["scope"]["value"] for item in precision_metrics if item["key"] == "proportion_score" and float(item["value"]) < 0.9]
+# columns_to_check = [item["scope"]["value"] for item in pack.metrics.data if item["key"] == "proportion_score" and float(item["value"]) < 0.9]
 
 # # Filter the DataFrame for rows that don't meet the rounding criteria in the specified columns
-# expected_precision = float(precision_metrics[1]["value"])
+# expected_precision = float(pack.metrics.data[1]["value"])
 # rows_with_rounding_issues = df[df[columns_to_check].applymap(lambda x: isinstance(x, float) and (len(str(x).split(".")[1]) if '.' in str(x) else 0) != expected_precision)]
 
 # # Check if there are rows with rounding issues
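
The refactor leaves the metric itself untouched: for each float column the pack counts decimals through the string representation, takes the modal decimal count, and scores the column by the share of values that match the mode. A worked example on made-up values, equivalent to the logic above:

# Worked example of the decimal-precision logic in main.py, on made-up data.
import pandas as pd

column_data = pd.Series([3.14, 2.71, 1.4142, 9.81, 6.0])

# Same decimal counting as main.py: via the string representation.
decimals_count = column_data.apply(
    lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0
)
# decimals_count -> 2, 2, 4, 2, 1

max_decimals = decimals_count.max()     # 4, reported as "decimal_precision"
most_common = decimals_count.mode()[0]  # 2, the modal decimal count
proportion_score = (decimals_count == most_common).sum() / len(decimals_count)
# 3 of 5 values carry 2 decimals -> proportion_score = 0.6

Because 0.6 falls below the 0.9 threshold, this column would receive an "Unevenly Rounded Data" recommendation, and its score would pull down the dataset-level mean.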