add precision recall pack + readme

qalita-io · Feb 5, 2024 · 40d3d82 · 40d3d82
1 parent 2426ee6
commit 40d3d82
Show file tree

Hide file tree

Showing 13 changed files with 486 additions and 3 deletions.
diff --git a/duplicates_finder_pack/README.md b/duplicates_finder_pack/README.md
@@ -31,7 +31,7 @@ This pack is compatible with **files** 📁 (``csv``, ``xslx``) and **databases*
 
 The report exports the duplicated data, adding the id column, grouping the rows by duplicates and sorting them.
 
-Filename is `duplicates_report_{source_config["name"]}_{current_date}.xlsx`
+Filename is `{current_date}_duplicates_finder_report_{source_config["name"]}_.xlsx`
 
 # Contribute 💡
 

diff --git a/outlier_detection_pack/README.md b/outlier_detection_pack/README.md
@@ -42,7 +42,7 @@ The pack generates a report containing the following insights:
 * **Univariate Outlier Detection**: A summary of the normality score for each numeric column, indicating the proportion of inliers in each column.
 * **Multivariate Outlier Detection**: A summary of the normality score for the entire dataset, indicating the proportion of inliers across the entire dataset.
 
-Filename is `outliers_report_{source_config["name"]}_{current_date}.xlsx`
+Filename is `{current_date}_outlier_detection_report_{source_config["name"]}.xlsx`
 
 # Contribute 💡
 

diff --git a/precision_recall_pack/LICENSE b/precision_recall_pack/LICENSE
@@ -0,0 +1,93 @@
+QALITA SOFTWARE LICENSE AGREEMENT
+
+THIS IS AN AGREEMENT BETWEEN YOU ("LICENSEE") AND QALITA SAS, A CORPORATION INCORPORATED UNDER THE LAWS OF FRANCE, COMPANY ID: 951 829 803 ("QALITA"). BY INSTALLING, COPYING, OR OTHERWISE USING THE QALITA SOFTWARE ("SOFTWARE"), YOU AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE AGREEMENT.
+
+    Copyright (c) - 2023-2024 - QALITA SAS - All Rights Reserved
+
+    1. GRANT OF LICENSE. Subject to the terms and conditions of this License Agreement, QALITA grants to Licensee a non-exclusive, non-transferable license to use the Software solely for Licensee's internal business purposes.
+
+    2. RESTRICTIONS. Licensee may not rent, lease, distribute, sublicense, transfer, or sell the Software, or any portion thereof. Licensee may not modify, translate, reverse engineer, decompile, disassemble, or create derivative works based on the Software, except to the extent that enforcement of the foregoing restriction is prohibited by applicable law.
+
+    3. COPYRIGHT AND OWNERSHIP. The Software is owned by QALITA and is protected by French copyright laws and international treaty provisions. QALITA retains all rights not expressly granted to Licensee in this License Agreement.
+
+    4. SOFTWARE DEPENDENCIES. The Software may include or depend on other software components which are licensed under terms and conditions different from this License Agreement. The licenses for these dependencies are included below, or in the documentation or files accompanying the Software.
+
+    5. NO WARRANTIES. The Software is provided "AS IS" and QALITA makes no warranty as to its use or performance. QALITA AND ITS SUPPLIERS DO NOT AND CANNOT WARRANT THE PERFORMANCE OR RESULTS YOU MAY OBTAIN BY USING THE SOFTWARE.
+
+    6. LIMITATION OF LIABILITY. In no event will QALITA or its suppliers be liable for any loss, damages or costs, whether direct, indirect, incidental, special, consequential, or punitive, arising out of Licensee's use of, or inability to use, the Software, even if QALITA has been advised of the possibility of such damages.
+
+    7. TERMINATION. QALITA may terminate this License Agreement if Licensee fails to comply with the terms and conditions of this License Agreement. In such event, Licensee must destroy all copies of the Software.
+
+    8. GENERAL. This License Agreement is governed by the laws of France. If any provision of this License Agreement is held to be void, invalid, unenforceable or illegal, the other provisions shall continue in full force and effect.
+
+BY INSTALLING, COPYING, OR OTHERWISE USING THE SOFTWARE, LICENSEE AGREES TO BE BOUND BY THE TERMS OF THIS LICENSE AGREEMENT.
+
+**QALITA SAS**
+
+**IMPORTANT:**
+
+BEFORE USING THIS SOFTWARE, CAREFULLY READ THIS LICENSE AGREEMENT. BY USING THE SOFTWARE, YOU ARE AGREEING TO BE BOUND BY THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE TO THE TERMS OF THIS LICENSE AGREEMENT, DO NOT USE THE SOFTWARE.
+
+This is a legal agreement and should be treated as such. If you have any questions regarding this agreement, please contact QALITA at **[email protected].**
+
+
+
+____
+
+Dependency : [SQLAlchemy License](https://github.com/sqlalchemy/sqlalchemy/blob/main/LICENSE)
+
+    Copyright 2005-2024 SQLAlchemy authors and contributors <see AUTHORS file>.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy of
+    this software and associated documentation files (the "Software"), to deal in
+    the Software without restriction, including without limitation the rights to
+    use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is furnished to do
+    so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE.
+
+____
+
+Dependency : [Pandas License](https://github.com/pandas-dev/pandas/blob/main/LICENSE)
+
+    BSD 3-Clause License
+
+    Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
+    All rights reserved.
+
+    Copyright (c) 2011-2024, Open source contributors.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+    * Neither the name of the copyright holder nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/precision_recall_pack/README.md b/precision_recall_pack/README.md
@@ -0,0 +1,38 @@
+# Precision and Recall
+
+This pack computes the **precision** and **recall** scores for a dataset.
+
+* The precision and recall scores are used to evaluate the performance of a model. The ``precision`` score is the ratio of the number of true positive predictions to the number of true positive predictions plus the number of false positive predictions.
+* The ``recall`` score is the ratio of the number of true positive predictions to the number of true positive predictions plus the number of false negative predictions.
+
+## Input 📥
+
+### Configuration ⚙️
+
+| Name                                    | Type   | Required | Default | Description                                                    |
+| --------------------------------------- | ------ | -------- | ------- | -------------------------------------------------------------- |
+| `job.source.skiprows`                   | `int`  | no       | `0`     | The number of rows to skip at the beginning of the file.       |
+| `job.compute_precision_recall_columns`  | `list` | no       | `[]`    | The list of columns to compute the PRECISION and RECALL score. |
+| `job.id_columns`                        | `list` | no       | `[]`    | The list of columns to use as identifier.                      |
+
+### Source type compatibility 🧩
+
+This pack is compatible with **files** 📁 (``csv``, ``xlsx``) and **databases** 🖥️ (``MySQL``, ``PostgreSQL``).
+
+## Analysis 🕵️‍♂️
+
+| Name              | Description          | Scope   | Type    |
+| ----------------- | -------------------- | ------- | ------- |
+| `score`           | Duplication score    | Dataset | `float` |
+| `precision_score` | Precision score      | Dataset | `float` |
+| `recall_score`    | Recall score         | Dataset | `float` |
+
+## Output 📤
+
+### Report 📊
+
+Filename is `{current_date}_precision_recall_report_{source_config["name"]}.xlsx`
+
+# Contribute 💡
+
+[This pack is part of Qalita Open Source Assets (QOSA) and is open to contribution. You can help us improve this pack by forking it and submitting a pull request here.](https://github.com/qalita-io/packs) 👥🚀
diff --git a/precision_recall_pack/icon.png b/precision_recall_pack/icon.png
diff --git a/precision_recall_pack/main.py b/precision_recall_pack/main.py
@@ -0,0 +1,30 @@
+"""
+Main file for pack
+"""
+import json
+import warnings
+
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
+########################### Loading Data
+
+# Load the configuration file
+print("Load source_conf.json")
+with open("source_conf.json", "r", encoding="utf-8") as file:
+    source_config = json.load(file)
+
+# Load the pack configuration file
+print("Load pack_conf.json")
+with open("pack_conf.json", "r", encoding="utf-8") as file:
+    pack_config = json.load(file)
+
+# Load data using the opener.py logic
+from opener import load_data
+
+df = load_data(source_config, pack_config)
+
+############################ Metrics
+
+############################ Recommendations
+
+######################## Export:
diff --git a/precision_recall_pack/opener.py b/precision_recall_pack/opener.py
@@ -0,0 +1,134 @@
+"""
+The opener module contains functions to load data from files and databases.
+"""
+
+import os
+import glob
+import pandas as pd
+from sqlalchemy import create_engine
+
+# Mapping of default ports (keyed as strings) to the database type used as the
+# scheme of the SQLAlchemy URL. create_db_connection falls back on this table
+# when the source configuration does not provide a database type.
+DEFAULT_PORTS = {
+    "5432": "postgresql",
+    "3306": "mysql",
+    "1433": "mssql+pymssql",
+}
+
+
+def load_data_file(file_path, pack_config):
+    # Check if the outer keys exist
+    if "job" in pack_config and "source" in pack_config["job"]:
+        # Now safely check for 'skiprows'
+        skiprows = pack_config["job"]["source"].get("skiprows")
+
+        if skiprows is not None:  # Checking if 'skiprows' exists and is not None
+            if file_path.endswith(".csv"):
+                return pd.read_csv(
+                    file_path,
+                    low_memory=False,
+                    memory_map=True,
+                    skiprows=int(skiprows),
+                    on_bad_lines="warn",
+                    encoding="utf-8",
+                )
+            elif file_path.endswith(".xlsx"):
+                return pd.read_excel(
+                    file_path,
+                    engine="openpyxl",
+                    skiprows=int(skiprows),
+                )
+    else:
+        # Logic when 'skiprows' is not specified
+        if file_path.endswith(".csv"):
+            return pd.read_csv(
+                file_path,
+                low_memory=False,
+                memory_map=True,
+                on_bad_lines="warn",
+                encoding="utf-8",
+            )
+        elif file_path.endswith(".xlsx"):
+            return pd.read_excel(file_path, engine="openpyxl")
+
+
+# Function to create database connection
+def create_db_connection(config):
+    user = config["username"]
+    password = config["password"]
+    host = config["host"]
+    port = config["port"]
+    type = config["type"]
+    db = config["database"]
+
+    if type:
+        db_type = type
+    else:
+        # Deduce the database type from the port
+        db_type = DEFAULT_PORTS.get(port, "unknown")
+        if db_type == "unknown":
+            raise ValueError(f"Unsupported or unknown database port: {port}")
+
+    engine = create_engine(f"{db_type}://{user}:{password}@{host}:{port}/{db}")
+    return engine
+
+
+# Function to load data from database
+def load_data_from_db(engine):
+    with engine.connect() as connection:
+        # Check liveness
+        try:
+            connection.execute("SELECT 1")
+        except Exception as e:
+            raise ConnectionError(f"Database connection failed: {e}")
+
+        # Scan tables
+        tables = engine.table_names()
+        if not tables:
+            raise ValueError("No tables found in the database.")
+
+        # Load each table into a DataFrame and return them
+        dataframes = {}
+        for table in tables:
+            dataframes[table] = pd.read_sql_table(table, engine)
+
+        return dataframes
+
+
+# Function to load data based on the configuration
+def load_data(source_config, pack_config):
+    source_type = source_config["type"]
+
+    if source_type == "file":
+        path = source_config["config"]["path"]
+
+        if os.path.isfile(path):
+            if path.endswith(".csv") or path.endswith(".xlsx"):
+                return load_data_file(path, pack_config)
+            else:
+                raise ValueError(
+                    "Unsupported file type. Only CSV and XLSX are supported."
+                )
+        elif os.path.isdir(path):
+            data_files = glob.glob(os.path.join(path, "*.csv")) + glob.glob(
+                os.path.join(path, "*.xlsx")
+            )
+            if not data_files:
+                raise FileNotFoundError(
+                    "No CSV or XLSX files found in the provided path."
+                )
+            first_data_file = data_files[0]
+            return load_data_file(first_data_file, pack_config)
+        else:
+            raise FileNotFoundError(
+                f"The path {path} is neither a file nor a directory. Or can't be reached."
+            )
+
+    elif source_type == "database":
+        db_config = source_config["config"]
+        engine = create_db_connection(db_config)
+        return load_data_from_db(engine)
+
+    else:
+        raise ValueError(
+            "Unsupported source type. Only 'file' and 'database' are supported."
+        )
diff --git a/precision_recall_pack/pack_conf.json b/precision_recall_pack/pack_conf.json
@@ -0,0 +1,25 @@
+{
+    "job": {
+        "compute_precision_recall_columns": [],
+        "id_columns": [],
+        "source": {
+            "skiprows": 0
+        }
+    },
+    "charts": {
+        "overview": [
+            {
+                "metric_key": "score",
+                "chart_type": "text",
+                "display_title": true,
+                "justify": true
+            },
+            {
+                "metric_key": "duplicates",
+                "chart_type": "text",
+                "display_title": true,
+                "justify": true
+            }
+        ]
+    }
+}
diff --git a/precision_recall_pack/properties.yaml b/precision_recall_pack/properties.yaml
@@ -0,0 +1,7 @@
+description: Compute precision/recall scores between a source and a target
+icon: icon.png
+name: precision_recall
+type: consistency
+url: https://github.com/qalita-io/packs/tree/main/precision_recall_pack
+version: 1.0.0
+visibility: public
diff --git a/precision_recall_pack/pyproject.toml b/precision_recall_pack/pyproject.toml
@@ -0,0 +1,20 @@
+[tool.poetry]
+name = "precision-recall"
+version = "1.0.0"
+description = "Pack precision-recall Compute precision/recall scores between a source and a target"
+authors = ["qalita"]
+license = "Proprietary"
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = ">=3.10,<3.12"
+scikit-learn = "^0.24.2"
+matplotlib = "3.7.0"
+lxml = "^4.9.3"
+pandas = "2.0.3"
+openpyxl = "^3.1.2"
+sqlalchemy = "^2.0.23"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"