
Commit

Better accuracy calculation, rounding of the data compare score, better scoping of duplicates, improved export presentation for outlier_detection, prevent exporting data from profiling, and silence unsupported type recommendation
armandleopold committed Feb 25, 2024
1 parent c06b067 commit 5cf61b9
Showing 11 changed files with 159 additions and 71 deletions.
46 changes: 28 additions & 18 deletions accuracy_pack/main.py
@@ -11,37 +11,37 @@
print("No float columns found. metrics.json will not be created.")
raise

total_proportion_score = 0 # Initialize total proportion score
total_proportion_score = 0 # Initialize total proportion score used for original mean
valid_columns_count = 0 # Count of columns that have at least one non-NaN value

float_total_proportion_score = 0 # Initialize total proportion score for float_mean
valid_points_count = 0 # Total count of valid data points (non-NaN) across all valid columns

for column in float_columns:
column_data = pack.df_source[column].dropna()

# Skip the column if it only contains NaN values

if column_data.empty:
continue


valid_data_points = len(column_data) # Number of non-NaN data points in the current column
decimals_count = column_data.apply(
lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0
)
max_decimals = decimals_count.max()
most_common_decimals_series = decimals_count.mode()

# Handle the scenario when the mode() returns an empty series

if most_common_decimals_series.empty:
print(f"No common decimal count found for column {column}.")
most_common_decimals = 0
proportion_score = 0
else:
most_common_decimals = most_common_decimals_series[
0
] # Get the most common decimals count
proportion_score = decimals_count[
decimals_count == most_common_decimals
].count() / len(decimals_count)

total_proportion_score += proportion_score # Add proportion score to the total
most_common_decimals = most_common_decimals_series[0]
proportion_score = decimals_count[decimals_count == most_common_decimals].count() / valid_data_points

total_proportion_score += proportion_score # For original mean calculation
valid_columns_count += 1 # Increment valid columns count

# For float_mean calculation:
float_total_proportion_score += proportion_score * valid_data_points
valid_points_count += valid_data_points

if max_decimals > 0:
pack.metrics.data.append(
@@ -68,11 +68,21 @@
total_proportion_score / valid_columns_count if valid_columns_count > 0 else 0
)

# Add the mean proportion score to the precision data
# Calculate the float mean proportion score considering all data points accurately
float_mean_proportion_score = float_total_proportion_score / valid_points_count if valid_points_count > 0 else 0

pack.metrics.data.append(
{
"key": "float_score",
"value": str(round(float_mean_proportion_score, 2)),
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)

pack.metrics.data.append(
{
"key": "score",
"value": str(round(mean_proportion_score, 2)), # Mean proportion score
"value": str(round(mean_proportion_score, 2)),
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)
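For reference, a minimal standalone sketch (column names, scores, and counts are hypothetical) of how the unweighted per-column mean reported as "score" and the new point-weighted "float_score" can diverge:

# Hypothetical per-column proportion scores and their non-NaN point counts
proportion_scores = {"col_a": 0.9, "col_b": 0.5}
valid_points = {"col_a": 100, "col_b": 10}

# "score": unweighted mean over valid columns
score = sum(proportion_scores.values()) / len(proportion_scores)  # 0.70

# "float_score": mean weighted by the number of valid data points per column
float_score = sum(
    proportion_scores[c] * valid_points[c] for c in proportion_scores
) / sum(valid_points.values())  # ~0.86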
11 changes: 11 additions & 0 deletions data/hepatitis-c/source_conf.json
@@ -0,0 +1,11 @@
{
"config": {
"path": "../data/hepatitis-c/HepatitisCdata.csv"
},
"description": "Laboratory values of blood donors and Hepatitis C patients",
"name": "Hepatitis C Prediction Dataset",
"reference": false,
"sensitive": false,
"type": "file",
"visibility": "public"
}
2 changes: 1 addition & 1 deletion data_compare_pack/main.py
@@ -120,7 +120,7 @@
pack.metrics.data.append(
{
"key": "score",
"value": score,
"value": str(round(score, 2)),
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)
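The rounding applied here follows the same convention now used across the packs; a tiny standalone illustration (dataset name hypothetical):

score = 0.98765
metric = {
    "key": "score",
    "value": str(round(score, 2)),  # "0.99": rounded to two decimals, stored as a string
    "scope": {"perimeter": "dataset", "value": "my_dataset"},  # hypothetical dataset name
}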
30 changes: 12 additions & 18 deletions duplicates_finder_pack/main.py
@@ -10,6 +10,7 @@
if (
"job" in pack.pack_config
and "compute_uniqueness_columns" in pack.pack_config["job"]
and len(pack.pack_config["job"]["compute_uniqueness_columns"]) > 0
):
uniqueness_columns = pack.pack_config["job"]["compute_uniqueness_columns"]
else:
@@ -19,11 +20,16 @@

# Step 1: Filter the DataFrame based on the specified columns
print("Columns used for checking duplicates:", uniqueness_columns)
df_subset = pack.df_source[uniqueness_columns]
df_subset = pack.df_source[uniqueness_columns].copy()
duplicates = df_subset.duplicated()
total_rows = len(pack.df_source)

print("total rows "+str(total_rows))

# Step 2: Calculate the number of duplicate rows based on this subset
total_duplicates = df_subset.duplicated().sum()
total_duplicates = duplicates.sum()

print("total duplicates "+str(total_duplicates))

# Calculate the scoped duplication score
duplication_score = round(total_duplicates / total_rows if total_rows > 0 else 0, 2)
@@ -35,7 +41,7 @@
pack.metrics.data.append(
{
"key": "score",
"value": score,
"value": str(round(score, 2)),
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)
@@ -69,7 +75,7 @@
if score < 0.9:

recommendation = {
"content": f"dataset '{pack.source_config['name']}' has a duplication rate of {duplication_score*100}%. on the scope {uniqueness_columns} .",
"content": f"dataset '{pack.source_config['name']}' has a duplication rate of {duplication_score*100}%. on the scope {uniqueness_columns.to_list()} .",
"type": "Duplicates",
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
"level": determine_recommendation_level(duplication_score),
@@ -84,25 +90,13 @@
# Step 1: Retrieve 'id_columns' from pack_config
id_columns = pack.pack_config.get("job", {}).get("id_columns", [])

# Check if uniqueness_columns is empty and handle accordingly
if not uniqueness_columns:
print("No columns specified for checking duplicates. Using all columns.")
uniqueness_columns = (
pack.df_source.columns.tolist()
) # Use all columns if none are specified

# Step 2: Identify duplicated rows
duplicated_rows = pack.df_source[
pack.df_source.duplicated(subset=uniqueness_columns, keep=False)
]
duplicated_rows = pack.df_source[duplicates]

# Check if there are any duplicates
if duplicated_rows.empty:
if duplicates.empty:
print("No duplicates found. No report will be generated.")
else:
# If there are duplicates, proceed with sorting and exporting
duplicated_rows = duplicated_rows.sort_values(by=uniqueness_columns)

# Step 3: Set index or create 'index' column for the Excel export
if id_columns:
# Ensure all id_columns are in the DataFrame columns
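A minimal standalone sketch (toy data) of the subset-scoped duplicate computation this file refines:

import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3, 4], "email": ["a@x", "b@x", "a@x", "a@x"]})
uniqueness_columns = ["email"]  # hypothetical scope from pack_config

df_subset = df[uniqueness_columns].copy()
duplicates = df_subset.duplicated()  # flags 2nd and later occurrences
total_duplicates = duplicates.sum()  # 2
duplication_score = round(total_duplicates / len(df) if len(df) > 0 else 0, 2)  # 0.5
duplicated_rows = df[duplicates]  # the rows exported in the report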
2 changes: 1 addition & 1 deletion duplicates_finder_pack/properties.yaml
@@ -3,5 +3,5 @@ icon: icon.png
name: duplicates_finder
type: uniqueness
url: https://github.com/qalita-io/packs/tree/main/duplicates_finder_pack
version: 2.0.0
version: 2.0.1
visibility: public
106 changes: 81 additions & 25 deletions outlier_detection_pack/main.py
@@ -1,5 +1,15 @@
from qalita_core.pack import Pack
from qalita_core.utils import determine_recommendation_level


# Define a function to determine recommendation level based on the proportion of outliers
def determine_recommendation_level(proportion_outliers):
if proportion_outliers > 0.5: # More than 50% of data are outliers
return "high"
elif proportion_outliers > 0.3: # More than 30% of data are outliers
return "warning"
else:
return "info"


import os
import numpy as np
@@ -14,7 +24,9 @@
# Fill missing values with mean
for column in pack.df_source.columns:
if np.issubdtype(pack.df_source[column].dtype, np.number):
pack.df_source[column] = pack.df_source[column].fillna(pack.df_source[column].mean())
pack.df_source[column] = pack.df_source[column].fillna(
pack.df_source[column].mean()
)

# Identify columns that still contain NaN values after filling
columns_with_nan = [
@@ -65,7 +77,7 @@
)

# Identify outliers based on the inlier score and threshold
outliers = pack.df_source[[column]][inlier_score < outlier_threshold]
outliers = pack.df_source[[column]][inlier_score < outlier_threshold].copy()
univariate_outliers[column] = outliers

outlier_count = len(outliers)
@@ -77,6 +89,23 @@
}
)

if outlier_count > 0:
pack.recommendations.data.append(
{
"content": f"Column '{column}' has {outlier_count} outliers.",
"type": "Outliers",
"scope": {"perimeter": "column", "value": column},
"level": determine_recommendation_level(
outlier_count / len(pack.df_source[[column]])
),
}
)


total_univariate_outliers = sum(
len(outliers) for outliers in univariate_outliers.values()
)

# Identify non-numeric columns
non_numeric_columns = pack.df_source.select_dtypes(exclude=[np.number]).columns

@@ -129,13 +158,25 @@
pack.metrics.data.append(
{
"key": "score",
"value": round(inlier_score.mean().item(), 2),
"value": str(round(inlier_score.mean().item(), 2)),
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)
except ValueError as e:
print(f"Error fitting the model: {e}")


total_multivariate_outliers = len(multivariate_outliers)
# total_outliers_count = total_univariate_outliers + total_multivariate_outliers
total_outliers_count = total_univariate_outliers
pack.metrics.data.append(
{
"key": "total_outliers_count",
"value": total_outliers_count,
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
}
)

pack.metrics.save()

# Define a threshold for considering a data point as an outlier
@@ -180,35 +221,60 @@
"content": f"The dataset '{pack.source_config['name']}' has a normality score of {dataset_normality_score*100}%.",
"type": "Outliers",
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
"level": determine_recommendation_level(
1 - dataset_normality_score
), # Convert percentage to proportion
"level": determine_recommendation_level(1 - dataset_normality_score),
}
pack.recommendations.data.append(recommendation)

pack.recommendations.data.append(
{
"content": f"The dataset '{pack.source_config['name']}' has a total of {total_outliers_count} outliers. Check them in output file.",
"type": "Outliers",
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
"level": determine_recommendation_level(
total_outliers_count / len(pack.df_source)
),
}
)

pack.recommendations.save()
####################### Export

# Step 1: Compile Univariate Outliers
all_univariate_outliers = pd.DataFrame()
for column, outliers in univariate_outliers.items():
# Select only the rows that are outliers, preserving the index
outliers_with_id = pack.df_source.loc[outliers.index, id_columns + [column]].copy()
# Create a new 'value' column to store the outlier values
outliers_with_id["value"] = outliers_with_id[column]
# Add column name as 'OutlierAttribute'
outliers_with_id["OutlierAttribute"] = column
outliers_with_id["index"] = outliers_with_id.index # Capture the original index
# Optionally, if you want to keep the original index as a column
outliers_with_id["index"] = outliers_with_id.index
# Drop the original value columns as we have captured it in 'value'
outliers_with_id = outliers_with_id[
id_columns + ["index", "OutlierAttribute", "value"]
]
# Concatenate to the all_univariate_outliers DataFrame
all_univariate_outliers = pd.concat(
[all_univariate_outliers, outliers_with_id], ignore_index=True
)

# Step 1: Compile Univariate Outliers
all_univariate_outliers_simple = pd.DataFrame()
for column, outliers in univariate_outliers.items():
outliers_with_id = pack.df_source.loc[outliers.index, id_columns + [column]].copy()
outliers_with_id["OutlierAttribute"] = column
outliers_with_id["index"] = outliers_with_id.index
all_univariate_outliers_simple = pd.concat(
[all_univariate_outliers_simple, outliers_with_id], ignore_index=True
)

# Rearrange columns for all_univariate_outliers
# Ensure the 'value' column is correctly placed if needed here
id_and_other_columns = (
["index"]
+ id_columns
+ ["OutlierAttribute"]
+ [
col
for col in all_univariate_outliers.columns
if col not in ["index"] + id_columns + ["OutlierAttribute"]
]
+ ["OutlierAttribute", "value"] # Include "value" in the list
)
all_univariate_outliers = all_univariate_outliers[id_and_other_columns]

@@ -234,7 +300,7 @@

# Step 3: Combine Data
all_outliers = pd.concat(
[all_univariate_outliers, multivariate_outliers], ignore_index=True
[all_univariate_outliers_simple, multivariate_outliers], ignore_index=True
)

# Ensure that all_outliers has the same column order
@@ -261,13 +327,3 @@
all_outliers.to_excel(writer, sheet_name="All Outliers", index=False)

print(f"Outliers report saved to {excel_file_path}")


# Define a function to determine recommendation level based on the proportion of outliers
def determine_recommendation_level(proportion_outliers):
if proportion_outliers > 0.5: # More than 20% of data are outliers
return "high"
elif proportion_outliers > 0.3: # More than 10% of data are outliers
return "warning"
else:
return "info"
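A small standalone sketch (row counts and columns hypothetical) of the new dataset-level outlier count and the recommendation-level thresholds it feeds:

import pandas as pd

# Hypothetical per-column univariate outlier frames; only their lengths matter here
univariate_outliers = {"age": pd.DataFrame(index=[3, 7]), "income": pd.DataFrame(index=[7])}
total_outliers_count = sum(len(o) for o in univariate_outliers.values())  # 3

dataset_rows = 100
proportion = total_outliers_count / dataset_rows  # 0.03
level = "high" if proportion > 0.5 else "warning" if proportion > 0.3 else "info"  # "info"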
8 changes: 7 additions & 1 deletion outlier_detection_pack/pack_conf.json
@@ -14,6 +14,12 @@
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "total_outliers_count",
"chart_type": "text",
"display_title": true,
"justify": true
}
],
"scoped": [
Expand All @@ -37,4 +43,4 @@
}
]
}
}
}
2 changes: 1 addition & 1 deletion outlier_detection_pack/properties.yaml
@@ -3,5 +3,5 @@ icon: icon.png
name: outlier_detection
type: reasonability
url: https://github.com/qalita-io/packs/tree/main/outlier_detection_pack
version: 2.0.1
version: 2.0.2
visibility: public