From 80920eb3ec86f862a1c3899123dee066022d9d98 Mon Sep 17 00:00:00 2001 From: Armand LEOPOLD Date: Fri, 23 Feb 2024 12:14:22 +0100 Subject: [PATCH 1/2] fixes --- profiling_pack/main.py | 16 ++++++++++------ profiling_pack/properties.yaml | 2 +- schema_scanner_pack/main.py | 4 +++- schema_scanner_pack/properties.yaml | 2 +- schema_scanner_pack/pyproject.toml | 1 - timeliness_pack/properties.yaml | 2 +- timeliness_pack/pyproject.toml | 1 - versioning_pack/properties.yaml | 2 +- versioning_pack/pyproject.toml | 1 - 9 files changed, 17 insertions(+), 14 deletions(-) diff --git a/profiling_pack/main.py b/profiling_pack/main.py index 84c5a71..5ce0daa 100644 --- a/profiling_pack/main.py +++ b/profiling_pack/main.py @@ -10,6 +10,7 @@ import os from ydata_profiling import ProfileReport from datetime import datetime +from io import StringIO pack = Pack() pack.load_data("source") @@ -22,7 +23,9 @@ # Run the profiling report profile = ProfileReport( - pack.df_source, title=f"Profiling Report for {dataset_scope_name}" + pack.df_source, + title=f"Profiling Report for {dataset_scope_name}", + correlations={"auto": {"calculate": False}}, ) # Save the report to HTML @@ -47,7 +50,8 @@ try: with open(html_file_name, "r", encoding="utf-8") as f: - tables = pd.read_html(f.read()) + html_content = f.read() + tables = pd.read_html(StringIO(html_content)) except ValueError as e: print(f"No tables found in the HTML report: {e}") tables = [pd.DataFrame()] # Create an empty DataFrame if no tables are found @@ -121,9 +125,9 @@ # Extract p_cells_missing value (as a decimal) df_missing = pd.DataFrame(pack.metrics.data) -p_cells_missing_value = df_missing[ - df_missing["key"] == "p_cells_missing" -]["value"].values[0] +p_cells_missing_value = df_missing[df_missing["key"] == "p_cells_missing"][ + "value" +].values[0] p_cells_missing = float(p_cells_missing_value) # Calculate the score @@ -180,7 +184,7 @@ print("No alerts table found in the HTML report.") alerts_data = pd.DataFrame() # Create an empty DataFrame if no alerts are found -alerts_list_of_dicts = alerts_data.to_dict(orient='records') +alerts_list_of_dicts = alerts_data.to_dict(orient="records") pack.recommendations.data = alerts_list_of_dicts ############################ Schemas diff --git a/profiling_pack/properties.yaml b/profiling_pack/properties.yaml index c25cc28..53589e6 100644 --- a/profiling_pack/properties.yaml +++ b/profiling_pack/properties.yaml @@ -3,5 +3,5 @@ icon: icon.png name: profiling type: completeness url: https://github.com/qalita-io/packs/tree/main/profiling_pack -version: 2.0.0 +version: 2.0.1 visibility: public diff --git a/schema_scanner_pack/main.py b/schema_scanner_pack/main.py index 573cbeb..ebb4693 100644 --- a/schema_scanner_pack/main.py +++ b/schema_scanner_pack/main.py @@ -2,6 +2,7 @@ import pandas as pd from ydata_profiling import ProfileReport from qalita_core.pack import Pack +from io import StringIO pack = Pack() pack.load_data("source") @@ -24,7 +25,8 @@ try: with open(html_file_name, "r", encoding="utf-8") as f: - tables = pd.read_html(f.read()) + html_content = f.read() + tables = pd.read_html(StringIO(html_content)) except ValueError as e: print(f"No tables found in the HTML report: {e}") tables = [pd.DataFrame()] # Create an empty DataFrame if no tables are found diff --git a/schema_scanner_pack/properties.yaml b/schema_scanner_pack/properties.yaml index d676082..754a491 100644 --- a/schema_scanner_pack/properties.yaml +++ b/schema_scanner_pack/properties.yaml @@ -3,5 +3,5 @@ icon: icon.png name: schema_scanner type: schema url: https://github.com/qalita-io/packs/tree/main/schema_scanner_pack -version: 2.0.0 +version: 2.0.2 visibility: public diff --git a/schema_scanner_pack/pyproject.toml b/schema_scanner_pack/pyproject.toml index b9a6a9f..85dd842 100644 --- a/schema_scanner_pack/pyproject.toml +++ b/schema_scanner_pack/pyproject.toml @@ -11,7 +11,6 @@ python = ">=3.10,<3.12" ydata-profiling = "^4.6.0" matplotlib = "3.7.0" lxml = "^4.9.3" -pandas = "2.0.3" openpyxl = "^3.1.2" sqlalchemy = "^2.0.23" html5lib = "^1.1" diff --git a/timeliness_pack/properties.yaml b/timeliness_pack/properties.yaml index bf321df..d092f7f 100644 --- a/timeliness_pack/properties.yaml +++ b/timeliness_pack/properties.yaml @@ -3,5 +3,5 @@ icon: icon.png name: timeliness type: timeliness url: https://github.com/qalita-io/packs/tree/main/timeliness_pack -version: 2.0.0 +version: 2.0.1 visibility: public diff --git a/timeliness_pack/pyproject.toml b/timeliness_pack/pyproject.toml index 19f93c3..3646934 100644 --- a/timeliness_pack/pyproject.toml +++ b/timeliness_pack/pyproject.toml @@ -10,7 +10,6 @@ readme = "README.md" python = ">=3.10,<3.12" matplotlib = "3.7.0" lxml = "^4.9.3" -pandas = "2.0.3" openpyxl = "^3.1.2" sqlalchemy = "^2.0.23" qalita-core = "^0.2.0" diff --git a/versioning_pack/properties.yaml b/versioning_pack/properties.yaml index 2b46f7d..9554b08 100644 --- a/versioning_pack/properties.yaml +++ b/versioning_pack/properties.yaml @@ -3,5 +3,5 @@ icon: icon.png name: versioning type: version url: https://github.com/qalita-io/packs/tree/main/versioning_pack -version: 2.0.0 +version: 2.0.1 visibility: public diff --git a/versioning_pack/pyproject.toml b/versioning_pack/pyproject.toml index 14d3ee8..0bcee9c 100644 --- a/versioning_pack/pyproject.toml +++ b/versioning_pack/pyproject.toml @@ -11,7 +11,6 @@ requests = "^2.31.0" python = ">=3.10,<3.12" matplotlib = "3.7.0" lxml = "^4.9.3" -pandas = "2.0.3" openpyxl = "^3.1.2" sqlalchemy = "^2.0.23" qalita-core = "^0.2.0" From 7f247a4a08910c4b1cf46966a02162b8ba18dec4 Mon Sep 17 00:00:00 2001 From: Armand LEOPOLD Date: Fri, 23 Feb 2024 12:45:19 +0100 Subject: [PATCH 2/2] fix outlier --- outlier_detection_pack/main.py | 2 +- outlier_detection_pack/properties.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/outlier_detection_pack/main.py b/outlier_detection_pack/main.py index 9bb594d..feaf6ad 100644 --- a/outlier_detection_pack/main.py +++ b/outlier_detection_pack/main.py @@ -94,7 +94,7 @@ df = pd.concat([df, encoded_df.reset_index(drop=True)], axis=1) # Exclude id_columns from df before Multivariate Outlier Detection -df_for_multivariate = pack.df_source.drop(columns=id_columns) +df_for_multivariate = df.drop(columns=id_columns) # Multivariate Outlier Detection multivariate_outliers = pd.DataFrame() diff --git a/outlier_detection_pack/properties.yaml b/outlier_detection_pack/properties.yaml index 29b1bc2..a9b158c 100644 --- a/outlier_detection_pack/properties.yaml +++ b/outlier_detection_pack/properties.yaml @@ -3,5 +3,5 @@ icon: icon.png name: outlier_detection type: reasonability url: https://github.com/qalita-io/packs/tree/main/outlier_detection_pack -version: 2.0.0 +version: 2.0.1 visibility: public