diff --git a/accuracy_pack/main.py b/accuracy_pack/main.py
index 2b61390..c56bc9a 100644
--- a/accuracy_pack/main.py
+++ b/accuracy_pack/main.py
@@ -11,37 +11,37 @@
     print("No float columns found. metrics.json will not be created.")
     raise
-total_proportion_score = 0  # Initialize total proportion score
+total_proportion_score = 0  # Initialize total proportion score used for the original mean
 valid_columns_count = 0  # Count of columns that have at least one non-NaN value
+float_total_proportion_score = 0  # Initialize total proportion score for float_mean
+valid_points_count = 0  # Total count of valid data points (non-NaN) across all valid columns
+
 for column in float_columns:
     column_data = pack.df_source[column].dropna()
-
-    # Skip the column if it only contains NaN values
+
     if column_data.empty:
         continue
-
+
+    valid_data_points = len(column_data)  # Number of non-NaN data points in the current column
     decimals_count = column_data.apply(
         lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0
     )
     max_decimals = decimals_count.max()
     most_common_decimals_series = decimals_count.mode()
-
-    # Handle the scenario when the mode() returns an empty series
+
     if most_common_decimals_series.empty:
-        print(f"No common decimal count found for column {column}.")
-        most_common_decimals = 0
         proportion_score = 0
     else:
-        most_common_decimals = most_common_decimals_series[
-            0
-        ]  # Get the most common decimals count
-        proportion_score = decimals_count[
-            decimals_count == most_common_decimals
-        ].count() / len(decimals_count)
-
-    total_proportion_score += proportion_score  # Add proportion score to the total
+        most_common_decimals = most_common_decimals_series[0]
+        proportion_score = decimals_count[decimals_count == most_common_decimals].count() / valid_data_points
+
+    total_proportion_score += proportion_score  # For original mean calculation
     valid_columns_count += 1  # Increment valid columns count
+
+    # For float_mean calculation:
+    float_total_proportion_score += proportion_score * valid_data_points
+    valid_points_count += valid_data_points

     if max_decimals > 0:
         pack.metrics.data.append(
@@ -68,11 +68,21 @@
     total_proportion_score / valid_columns_count if valid_columns_count > 0 else 0
 )

-# Add the mean proportion score to the precision data
+# Calculate the float mean proportion score, weighting each column by its valid data points
+float_mean_proportion_score = float_total_proportion_score / valid_points_count if valid_points_count > 0 else 0
+
+pack.metrics.data.append(
+    {
+        "key": "float_score",
+        "value": str(round(float_mean_proportion_score, 2)),
+        "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
+    }
+)
+
 pack.metrics.data.append(
     {
         "key": "score",
-        "value": str(round(mean_proportion_score, 2)),  # Mean proportion score
+        "value": str(round(mean_proportion_score, 2)),
         "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
     }
 )
diff --git a/data/hepatitis-c/source_conf.json b/data/hepatitis-c/source_conf.json
new file mode 100644
index 0000000..6095ee0
--- /dev/null
+++ b/data/hepatitis-c/source_conf.json
@@ -0,0 +1,11 @@
+{
+    "config": {
+        "path": "../data/hepatitis-c/HepatitisCdata.csv"
+    },
+    "description": "Laboratory values of blood donors and Hepatitis C patients",
+    "name": "Hepatitis C Prediction Dataset",
+    "reference": false,
+    "sensitive": false,
+    "type": "file",
+    "visibility": "public"
+}
\ No newline at end of file
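For reference, the new float_score above is a point-weighted mean: each column's proportion of values sharing its modal decimal count is weighted by that column's number of non-NaN values, whereas the original score averages the column proportions directly. A minimal standalone sketch of that arithmetic, assuming only pandas (the DataFrame and its column names are illustrative, not part of the pack):

import pandas as pd

df = pd.DataFrame({
    "a": [1.25, 3.55, 2.7, None, None, None],
    "b": [0.1, 0.2, 0.3, 0.4, 0.5, 0.66],
})

total_score, weighted_total, column_count, point_count = 0.0, 0.0, 0, 0
for name in df.select_dtypes("float").columns:
    data = df[name].dropna()
    if data.empty:
        continue
    decimals = data.apply(lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0)
    mode = decimals.mode()
    # Proportion of values that share the most common decimal count
    score = 0 if mode.empty else (decimals == mode[0]).sum() / len(data)
    total_score += score
    weighted_total += score * len(data)  # weight by non-NaN points
    column_count += 1
    point_count += len(data)

print(round(total_score / column_count, 2))   # plain per-column mean -> 0.75
print(round(weighted_total / point_count, 2)) # point-weighted mean   -> 0.78

Here column "a" scores 2/3 on 3 points and column "b" scores 5/6 on 6 points, so the plain mean is 0.75 while the weighted mean is 7/9, about 0.78; that gap is what the separate float_score metric surfaces.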
diff --git a/data_compare_pack/main.py b/data_compare_pack/main.py
index f0f58d2..3571edd 100644
--- a/data_compare_pack/main.py
+++ b/data_compare_pack/main.py
@@ -120,7 +120,7 @@
 pack.metrics.data.append(
     {
         "key": "score",
-        "value": score,
+        "value": str(round(score, 2)),
         "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
     }
 )
diff --git a/duplicates_finder_pack/main.py b/duplicates_finder_pack/main.py
index 77e4b12..894b96b 100644
--- a/duplicates_finder_pack/main.py
+++ b/duplicates_finder_pack/main.py
@@ -10,6 +10,7 @@
 if (
     "job" in pack.pack_config
     and "compute_uniqueness_columns" in pack.pack_config["job"]
+    and len(pack.pack_config["job"]["compute_uniqueness_columns"]) > 0
 ):
     uniqueness_columns = pack.pack_config["job"]["compute_uniqueness_columns"]
 else:
@@ -19,11 +20,16 @@

 # Step 1: Filter the DataFrame based on the specified columns
 print("Columns used for checking duplicates:", uniqueness_columns)
-df_subset = pack.df_source[uniqueness_columns]
+df_subset = pack.df_source[uniqueness_columns].copy()
+duplicates = df_subset.duplicated()

 total_rows = len(pack.df_source)
+print("Total rows: " + str(total_rows))
+
 # Step 2: Calculate the number of duplicate rows based on this subset
-total_duplicates = df_subset.duplicated().sum()
+total_duplicates = duplicates.sum()
+
+print("Total duplicates: " + str(total_duplicates))

 # Calculate the scoped duplication score
 duplication_score = round(total_duplicates / total_rows if total_rows > 0 else 0, 2)
@@ -35,7 +41,7 @@
 pack.metrics.data.append(
     {
         "key": "score",
-        "value": score,
+        "value": str(round(score, 2)),
         "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
     }
 )
@@ -69,7 +75,7 @@
 if score < 0.9:
     recommendation = {
-        "content": f"dataset '{pack.source_config['name']}' has a duplication rate of {duplication_score*100}%. on the scope {uniqueness_columns} .",
+        "content": f"Dataset '{pack.source_config['name']}' has a duplication rate of {duplication_score*100}% on the scope {list(uniqueness_columns)}.",
         "type": "Duplicates",
         "scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
         "level": determine_recommendation_level(duplication_score),
@@ -84,25 +90,13 @@
 # Step 1: Retrieve 'id_columns' from pack_config
 id_columns = pack.pack_config.get("job", {}).get("id_columns", [])

-# Check if uniqueness_columns is empty and handle accordingly
-if not uniqueness_columns:
-    print("No columns specified for checking duplicates. Using all columns.")
-    uniqueness_columns = (
-        pack.df_source.columns.tolist()
-    )  # Use all columns if none are specified
-
 # Step 2: Identify duplicated rows
-duplicated_rows = pack.df_source[
-    pack.df_source.duplicated(subset=uniqueness_columns, keep=False)
-]
+duplicated_rows = pack.df_source[duplicates]

 # Check if there are any duplicates
-if duplicated_rows.empty:
+if not duplicates.any():
     print("No duplicates found. No report will be generated.")
 else:
-    # If there are duplicates, proceed with sorting and exporting
-    duplicated_rows = duplicated_rows.sort_values(by=uniqueness_columns)
-
     # Step 3: Set index or create 'index' column for the Excel export
     if id_columns:
         # Ensure all id_columns are in the DataFrame columns
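One subtlety in the hunk above: pandas' duplicated() leaves the first occurrence of each group unflagged by default, so total_duplicates counts the extra rows, not every member of a duplicate group. A minimal sketch, assuming only pandas; the data is made up, and since the diff does not show how the pack derives score from the duplication rate, the one-minus-rate step below is an assumption for illustration:

import pandas as pd

df = pd.DataFrame({"city": ["Paris", "Lyon", "Lyon", "Nice", "Nice", "Nice"]})
duplicates = df[["city"]].duplicated()  # first occurrence in each group stays False

total_rows = len(df)
total_duplicates = duplicates.sum()  # 3: one extra "Lyon" plus two extra "Nice"
duplication_score = round(total_duplicates / total_rows if total_rows > 0 else 0, 2)
score = round(1 - duplication_score, 2)  # assumed: higher score = fewer duplicates

print(total_duplicates, duplication_score, score)  # 3 0.5 0.5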
diff --git a/duplicates_finder_pack/properties.yaml b/duplicates_finder_pack/properties.yaml
index a9fe17c..0356dcd 100644
--- a/duplicates_finder_pack/properties.yaml
+++ b/duplicates_finder_pack/properties.yaml
@@ -3,5 +3,5 @@ icon: icon.png
 name: duplicates_finder
 type: uniqueness
 url: https://github.com/qalita-io/packs/tree/main/duplicates_finder_pack
-version: 2.0.0
+version: 2.0.1
 visibility: public
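The outlier_detection_pack change below re-inlines determine_recommendation_level instead of importing it from qalita_core.utils. For quick reference, this is the severity mapping it implements, copied standalone; note the comparisons are strict, so exactly 50% or 30% falls into the lower level:

def determine_recommendation_level(proportion_outliers):
    if proportion_outliers > 0.5:    # more than 50% of points are outliers
        return "high"
    elif proportion_outliers > 0.3:  # more than 30%
        return "warning"
    else:
        return "info"

assert determine_recommendation_level(0.51) == "high"
assert determine_recommendation_level(0.50) == "warning"  # strict comparison
assert determine_recommendation_level(0.30) == "info"     # strict comparison
assert determine_recommendation_level(0.05) == "info"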
"type": "Outliers", "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, - "level": determine_recommendation_level( - 1 - dataset_normality_score - ), # Convert percentage to proportion + "level": determine_recommendation_level(1 - dataset_normality_score), } pack.recommendations.data.append(recommendation) +pack.recommendations.data.append( + { + "content": f"The dataset '{pack.source_config['name']}' has a total of {total_outliers_count} outliers. Check them in output file.", + "type": "Outliers", + "scope": {"perimeter": "dataset", "value": pack.source_config["name"]}, + "level": determine_recommendation_level( + total_outliers_count / len(pack.df_source) + ), + } +) + pack.recommendations.save() ####################### Export # Step 1: Compile Univariate Outliers all_univariate_outliers = pd.DataFrame() for column, outliers in univariate_outliers.items(): + # Select only the rows that are outliers, preserving the index outliers_with_id = pack.df_source.loc[outliers.index, id_columns + [column]].copy() + # Create a new 'value' column to store the outlier values + outliers_with_id["value"] = outliers_with_id[column] + # Add column name as 'OutlierAttribute' outliers_with_id["OutlierAttribute"] = column - outliers_with_id["index"] = outliers_with_id.index # Capture the original index + # Optionally, if you want to keep the original index as a column + outliers_with_id["index"] = outliers_with_id.index + # Drop the original value columns as we have captured it in 'value' + outliers_with_id = outliers_with_id[ + id_columns + ["index", "OutlierAttribute", "value"] + ] + # Concatenate to the all_univariate_outliers DataFrame all_univariate_outliers = pd.concat( [all_univariate_outliers, outliers_with_id], ignore_index=True ) +# Step 1: Compile Univariate Outliers +all_univariate_outliers_simple = pd.DataFrame() +for column, outliers in univariate_outliers.items(): + outliers_with_id = pack.df_source.loc[outliers.index, id_columns + [column]].copy() + outliers_with_id["OutlierAttribute"] = column + outliers_with_id["index"] = outliers_with_id.index + all_univariate_outliers_simple = pd.concat( + [all_univariate_outliers_simple, outliers_with_id], ignore_index=True + ) + # Rearrange columns for all_univariate_outliers +# Ensure the 'value' column is correctly placed if needed here id_and_other_columns = ( ["index"] + id_columns - + ["OutlierAttribute"] - + [ - col - for col in all_univariate_outliers.columns - if col not in ["index"] + id_columns + ["OutlierAttribute"] - ] + + ["OutlierAttribute", "value"] # Include "value" in the list ) all_univariate_outliers = all_univariate_outliers[id_and_other_columns] @@ -234,7 +300,7 @@ # Step 3: Combine Data all_outliers = pd.concat( - [all_univariate_outliers, multivariate_outliers], ignore_index=True + [all_univariate_outliers_simple, multivariate_outliers], ignore_index=True ) # Ensure that all_outliers has the same column order @@ -261,13 +327,3 @@ all_outliers.to_excel(writer, sheet_name="All Outliers", index=False) print(f"Outliers report saved to {excel_file_path}") - - -# Define a function to determine recommendation level based on the proportion of outliers -def determine_recommendation_level(proportion_outliers): - if proportion_outliers > 0.5: # More than 20% of data are outliers - return "high" - elif proportion_outliers > 0.3: # More than 10% of data are outliers - return "warning" - else: - return "info" diff --git a/outlier_detection_pack/pack_conf.json b/outlier_detection_pack/pack_conf.json index fc4c1f6..06ecdae 
diff --git a/outlier_detection_pack/pack_conf.json b/outlier_detection_pack/pack_conf.json
index fc4c1f6..06ecdae 100644
--- a/outlier_detection_pack/pack_conf.json
+++ b/outlier_detection_pack/pack_conf.json
@@ -14,6 +14,12 @@
             "chart_type": "text",
             "display_title": true,
             "justify": true
+        },
+        {
+            "metric_key": "total_outliers_count",
+            "chart_type": "text",
+            "display_title": true,
+            "justify": true
         }
     ],
     "scoped": [
@@ -37,4 +43,4 @@
         }
     ]
     }
-}
+}
\ No newline at end of file
diff --git a/outlier_detection_pack/properties.yaml b/outlier_detection_pack/properties.yaml
index a9b158c..c018b2e 100644
--- a/outlier_detection_pack/properties.yaml
+++ b/outlier_detection_pack/properties.yaml
@@ -3,5 +3,5 @@ icon: icon.png
 name: outlier_detection
 type: reasonability
 url: https://github.com/qalita-io/packs/tree/main/outlier_detection_pack
-version: 2.0.1
+version: 2.0.2
 visibility: public
diff --git a/profiling_pack/main.py b/profiling_pack/main.py
index 5ce0daa..f6df764 100644
--- a/profiling_pack/main.py
+++ b/profiling_pack/main.py
@@ -251,6 +251,14 @@
 #     }
 # )

+################## Remove unwanted metrics or recommendations
+
+unwanted_keys = ["histogram"]
+pack.metrics.data = [item for item in pack.metrics.data if item.get("key") not in unwanted_keys]
+
+unwanted_types = ["Unsupported"]
+pack.recommendations.data = [item for item in pack.recommendations.data if item.get("type") not in unwanted_types]
+
 pack.metrics.save()
 pack.recommendations.save()
 pack.schemas.save()
diff --git a/profiling_pack/properties.yaml b/profiling_pack/properties.yaml
index 53589e6..6bd6823 100644
--- a/profiling_pack/properties.yaml
+++ b/profiling_pack/properties.yaml
@@ -3,5 +3,5 @@ icon: icon.png
 name: profiling
 type: completeness
 url: https://github.com/qalita-io/packs/tree/main/profiling_pack
-version: 2.0.1
+version: 2.0.2
 visibility: public
diff --git a/tests/test_one_pack.sh b/tests/test_one_pack.sh
old mode 100644
new mode 100755
index 810ac4c..ca0f74e
--- a/tests/test_one_pack.sh
+++ b/tests/test_one_pack.sh
@@ -17,9 +17,10 @@ current_datetime=$(date +"%Y%m%d")
 process_data_pack() {
     local dataset="$1"
     local pack="$2"
-    # Define log file path
-    mkdir -p "${DATA_DIR}/${dataset}/output/${current_datetime}"
-    log_file="${DATA_DIR}/${dataset}/output/${current_datetime}/${dataset}_${pack}.log"
+    # Determine the dataset directory once and reuse it
+    local dataset_dir="${DATA_DIR}/${dataset}"
+    mkdir -p "${dataset_dir}/output/${current_datetime}"
+    log_file="${dataset_dir}/output/${current_datetime}/${dataset}_${pack}.log"
     # Redirect the outputs of this iteration to the log file
     {
         echo "Processing Dataset: ${dataset} with Test Pack: ${pack}"
@@ -71,11 +72,13 @@ if [ "$#" -lt 1 ]; then
 fi

 PACK=$1
+ROOT_DIR=$(pwd)
+DATA_DIR="${ROOT_DIR}/data"
+
+# Check if the second argument is provided (non-empty)
 if [ -n "$2" ]; then
     DATASETS=("$2")
 else
-    ROOT_DIR=$(pwd)
-    DATA_DIR="${ROOT_DIR}/data"
     generate_test_datasets "${DATA_DIR}"
     DATASETS=("${TEST_DATASETS[@]}")
 fi
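Finally, the profiling_pack hunk above drops metric entries by key and recommendation entries by type with plain list comprehensions. A self-contained sketch of the same filtering, with fabricated sample entries:

metrics = [
    {"key": "histogram", "value": "[...]"},
    {"key": "score", "value": "0.98"},
]
recommendations = [
    {"type": "Unsupported", "content": "Column 'x' has an unsupported type."},
    {"type": "Duplicates", "content": "Dataset has a duplication rate of 3%."},
]

unwanted_keys = ["histogram"]
metrics = [item for item in metrics if item.get("key") not in unwanted_keys]

unwanted_types = ["Unsupported"]
recommendations = [item for item in recommendations if item.get("type") not in unwanted_types]

print(metrics)          # [{'key': 'score', 'value': '0.98'}]
print(recommendations)  # only the 'Duplicates' entry remains

Using item.get(...) rather than item[...] keeps the filter tolerant of entries that lack the inspected field.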