Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test/add testing cases #20

Merged
merged 2 commits into from
Feb 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ schemas.json
*_metrics.json
*_recommendations.json
*_schemas.json
*.log
46 changes: 28 additions & 18 deletions accuracy_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,37 +11,37 @@
print("No float columns found. metrics.json will not be created.")
raise

total_proportion_score = 0 # Initialize total proportion score
total_proportion_score = 0 # Initialize total proportion score used for original mean
valid_columns_count = 0 # Count of columns that have at least one non-NaN value

float_total_proportion_score = 0 # Initialize total proportion score for float_mean
valid_points_count = 0 # Total count of valid data points (non-NaN) across all valid columns

for column in float_columns:
column_data = pack.df_source[column].dropna()

# Skip the column if it only contains NaN values

if column_data.empty:
continue


valid_data_points = len(column_data) # Number of non-NaN data points in the current column
decimals_count = column_data.apply(
lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0
)
max_decimals = decimals_count.max()
most_common_decimals_series = decimals_count.mode()

# Handle the scenario when the mode() returns an empty series

if most_common_decimals_series.empty:
print(f"No common decimal count found for column {column}.")
most_common_decimals = 0
proportion_score = 0
else:
most_common_decimals = most_common_decimals_series[
0
] # Get the most common decimals count
proportion_score = decimals_count[
decimals_count == most_common_decimals
].count() / len(decimals_count)

total_proportion_score += proportion_score # Add proportion score to the total
most_common_decimals = most_common_decimals_series[0]
proportion_score = decimals_count[decimals_count == most_common_decimals].count() / valid_data_points

total_proportion_score += proportion_score # For original mean calculation
valid_columns_count += 1 # Increment valid columns count

# For float_mean calculation:
float_total_proportion_score += proportion_score * valid_data_points
valid_points_count += valid_data_points

if max_decimals > 0:
pack.metrics.data.append(
Expand All @@ -68,11 +68,21 @@
total_proportion_score / valid_columns_count if valid_columns_count > 0 else 0
)

# Add the mean proportion score to the precision data
# Calculate the float mean proportion score considering all data points accurately
float_mean_proportion_score = float_total_proportion_score / valid_points_count if valid_points_count > 0 else 0

pack.metrics.data.append(
{
"key": "float_score",
"value": str(round(float_mean_proportion_score, 2)),
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)

pack.metrics.data.append(
{
"key": "score",
"value": str(round(mean_proportion_score, 2)), # Mean proportion score
"value": str(round(mean_proportion_score, 2)),
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)
Expand Down
11 changes: 11 additions & 0 deletions data/hepatitis-c/source_conf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"config": {
"path": "../data/hepatitis-c/HepatitisCdata.csv"
},
"description": "Laboratory values of blood donors and Hepatitis C patients",
"name": "Hepatitis C Prediction Dataset",
"reference": false,
"sensitive": false,
"type": "file",
"visibility": "public"
}
2 changes: 1 addition & 1 deletion data_compare_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@
pack.metrics.data.append(
{
"key": "score",
"value": score,
"value": str(round(score, 2)),
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)
Expand Down
30 changes: 12 additions & 18 deletions duplicates_finder_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
if (
"job" in pack.pack_config
and "compute_uniqueness_columns" in pack.pack_config["job"]
and len(pack.pack_config["job"]["compute_uniqueness_columns"]) > 0
):
uniqueness_columns = pack.pack_config["job"]["compute_uniqueness_columns"]
else:
Expand All @@ -19,11 +20,16 @@

# Step 1: Filter the DataFrame based on the specified columns
print("Columns used for checking duplicates:", uniqueness_columns)
df_subset = pack.df_source[uniqueness_columns]
df_subset = pack.df_source[uniqueness_columns].copy()
duplicates = df_subset.duplicated()
total_rows = len(pack.df_source)

print("total rows "+str(total_rows))

# Step 2: Calculate the number of duplicate rows based on this subset
total_duplicates = df_subset.duplicated().sum()
total_duplicates = duplicates.sum()

print("total duplicates "+str(total_duplicates))

# Calculate the scoped duplication score
duplication_score = round(total_duplicates / total_rows if total_rows > 0 else 0, 2)
Expand All @@ -35,7 +41,7 @@
pack.metrics.data.append(
{
"key": "score",
"value": score,
"value": str(round(score, 2)),
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)
Expand Down Expand Up @@ -69,7 +75,7 @@
if score < 0.9:

recommendation = {
"content": f"dataset '{pack.source_config['name']}' has a duplication rate of {duplication_score*100}%. on the scope {uniqueness_columns} .",
"content": f"dataset '{pack.source_config['name']}' has a duplication rate of {duplication_score*100}%. on the scope {uniqueness_columns.to_list()} .",
"type": "Duplicates",
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
"level": determine_recommendation_level(duplication_score),
Expand All @@ -84,25 +90,13 @@
# Step 1: Retrieve 'id_columns' from pack_config
id_columns = pack.pack_config.get("job", {}).get("id_columns", [])

# Check if uniqueness_columns is empty and handle accordingly
if not uniqueness_columns:
print("No columns specified for checking duplicates. Using all columns.")
uniqueness_columns = (
pack.df_source.columns.tolist()
) # Use all columns if none are specified

# Step 2: Identify duplicated rows
duplicated_rows = pack.df_source[
pack.df_source.duplicated(subset=uniqueness_columns, keep=False)
]
duplicated_rows = pack.df_source[duplicates]

# Check if there are any duplicates
if duplicated_rows.empty:
if duplicates.empty:
print("No duplicates found. No report will be generated.")
else:
# If there are duplicates, proceed with sorting and exporting
duplicated_rows = duplicated_rows.sort_values(by=uniqueness_columns)

# Step 3: Set index or create 'index' column for the Excel export
if id_columns:
# Ensure all id_columns are in the DataFrame columns
Expand Down
2 changes: 1 addition & 1 deletion duplicates_finder_pack/properties.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ icon: icon.png
name: duplicates_finder
type: uniqueness
url: https://github.com/qalita-io/packs/tree/main/duplicates_finder_pack
version: 2.0.0
version: 2.0.1
visibility: public
106 changes: 81 additions & 25 deletions outlier_detection_pack/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
from qalita_core.pack import Pack
from qalita_core.utils import determine_recommendation_level


# Define a function to determine recommendation level based on the proportion of outliers
def determine_recommendation_level(proportion_outliers):
    """Map the proportion of outlier data points to a recommendation level.

    Args:
        proportion_outliers: Fraction (0..1) of data points flagged as outliers.

    Returns:
        "high" when more than 50% of the data are outliers, "warning" when
        more than 30%, and "info" otherwise.
    """
    # Walk severity tiers from most to least severe; first match wins.
    for threshold, level in ((0.5, "high"), (0.3, "warning")):
        if proportion_outliers > threshold:
            return level
    return "info"


import os
import numpy as np
Expand All @@ -14,7 +24,9 @@
# Fill missing values with mean
for column in pack.df_source.columns:
if np.issubdtype(pack.df_source[column].dtype, np.number):
pack.df_source[column] = pack.df_source[column].fillna(pack.df_source[column].mean())
pack.df_source[column] = pack.df_source[column].fillna(
pack.df_source[column].mean()
)

# Identify columns that still contain NaN values after filling
columns_with_nan = [
Expand Down Expand Up @@ -65,7 +77,7 @@
)

# Identify outliers based on the inlier score and threshold
outliers = pack.df_source[[column]][inlier_score < outlier_threshold]
outliers = pack.df_source[[column]][inlier_score < outlier_threshold].copy()
univariate_outliers[column] = outliers

outlier_count = len(outliers)
Expand All @@ -77,6 +89,23 @@
}
)

if outlier_count > 0:
pack.recommendations.data.append(
{
"content": f"Column '{column}' has {outlier_count} outliers.",
"type": "Outliers",
"scope": {"perimeter": "column", "value": column},
"level": determine_recommendation_level(
outlier_count / len(pack.df_source[[column]])
),
}
)


total_univariate_outliers = sum(
len(outliers) for outliers in univariate_outliers.values()
)

# Identify non-numeric columns
non_numeric_columns = pack.df_source.select_dtypes(exclude=[np.number]).columns

Expand Down Expand Up @@ -129,13 +158,25 @@
pack.metrics.data.append(
{
"key": "score",
"value": round(inlier_score.mean().item(), 2),
"value": str(round(inlier_score.mean().item(), 2)),
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)
except ValueError as e:
print(f"Error fitting the model: {e}")


total_multivariate_outliers = len(multivariate_outliers)
# total_outliers_count = total_univariate_outliers + total_multivariate_outliers
total_outliers_count = total_univariate_outliers
pack.metrics.data.append(
{
"key": "total_outliers_count",
"value": total_outliers_count,
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)

pack.metrics.save()

# Define a threshold for considering a data point as an outlier
Expand Down Expand Up @@ -180,35 +221,60 @@
"content": f"The dataset '{pack.source_config['name']}' has a normality score of {dataset_normality_score*100}%.",
"type": "Outliers",
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
"level": determine_recommendation_level(
1 - dataset_normality_score
), # Convert percentage to proportion
"level": determine_recommendation_level(1 - dataset_normality_score),
}
pack.recommendations.data.append(recommendation)

pack.recommendations.data.append(
{
"content": f"The dataset '{pack.source_config['name']}' has a total of {total_outliers_count} outliers. Check them in output file.",
"type": "Outliers",
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
"level": determine_recommendation_level(
total_outliers_count / len(pack.df_source)
),
}
)

pack.recommendations.save()
####################### Export

# Step 1: Compile Univariate Outliers
all_univariate_outliers = pd.DataFrame()
for column, outliers in univariate_outliers.items():
# Select only the rows that are outliers, preserving the index
outliers_with_id = pack.df_source.loc[outliers.index, id_columns + [column]].copy()
# Create a new 'value' column to store the outlier values
outliers_with_id["value"] = outliers_with_id[column]
# Add column name as 'OutlierAttribute'
outliers_with_id["OutlierAttribute"] = column
outliers_with_id["index"] = outliers_with_id.index # Capture the original index
# Optionally, if you want to keep the original index as a column
outliers_with_id["index"] = outliers_with_id.index
# Drop the original value columns as we have captured it in 'value'
outliers_with_id = outliers_with_id[
id_columns + ["index", "OutlierAttribute", "value"]
]
# Concatenate to the all_univariate_outliers DataFrame
all_univariate_outliers = pd.concat(
[all_univariate_outliers, outliers_with_id], ignore_index=True
)

# Step 1: Compile Univariate Outliers
all_univariate_outliers_simple = pd.DataFrame()
for column, outliers in univariate_outliers.items():
outliers_with_id = pack.df_source.loc[outliers.index, id_columns + [column]].copy()
outliers_with_id["OutlierAttribute"] = column
outliers_with_id["index"] = outliers_with_id.index
all_univariate_outliers_simple = pd.concat(
[all_univariate_outliers_simple, outliers_with_id], ignore_index=True
)

# Rearrange columns for all_univariate_outliers
# Ensure the 'value' column is correctly placed if needed here
id_and_other_columns = (
["index"]
+ id_columns
+ ["OutlierAttribute"]
+ [
col
for col in all_univariate_outliers.columns
if col not in ["index"] + id_columns + ["OutlierAttribute"]
]
+ ["OutlierAttribute", "value"] # Include "value" in the list
)
all_univariate_outliers = all_univariate_outliers[id_and_other_columns]

Expand All @@ -234,7 +300,7 @@

# Step 3: Combine Data
all_outliers = pd.concat(
[all_univariate_outliers, multivariate_outliers], ignore_index=True
[all_univariate_outliers_simple, multivariate_outliers], ignore_index=True
)

# Ensure that all_outliers has the same column order
Expand All @@ -261,13 +327,3 @@
all_outliers.to_excel(writer, sheet_name="All Outliers", index=False)

print(f"Outliers report saved to {excel_file_path}")


# Define a function to determine recommendation level based on the proportion of outliers
def determine_recommendation_level(proportion_outliers):
    """Return a recommendation severity based on the proportion of outliers.

    Args:
        proportion_outliers: Fraction (0..1) of data points that are outliers.

    Returns:
        "high" when the proportion exceeds 0.5, "warning" when it exceeds
        0.3, and "info" otherwise.
    """
    if proportion_outliers > 0.5:  # More than 50% of data are outliers
        return "high"
    elif proportion_outliers > 0.3:  # More than 30% of data are outliers
        return "warning"
    else:
        return "info"
8 changes: 7 additions & 1 deletion outlier_detection_pack/pack_conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "total_outliers_count",
"chart_type": "text",
"display_title": true,
"justify": true
}
],
"scoped": [
Expand All @@ -37,4 +43,4 @@
}
]
}
}
}
2 changes: 1 addition & 1 deletion outlier_detection_pack/properties.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ icon: icon.png
name: outlier_detection
type: reasonability
url: https://github.com/qalita-io/packs/tree/main/outlier_detection_pack
version: 2.0.1
version: 2.0.2
visibility: public
Loading
Loading