
Commit

#8
armandleopold committed Feb 1, 2024
1 parent b8bd8b1 commit 298d4b6
Showing 3 changed files with 71 additions and 46 deletions.
106 changes: 61 additions & 45 deletions duplicates_finder_pack/main.py
@@ -25,23 +25,16 @@

df = load_data(source_config, pack_config)

if "job" in pack_config and "compute_uniqueness_columns" in pack_config["job"]:
# Compute the uniqueness columns
uniqueness_columns = pack_config["job"]["compute_uniqueness_columns"]
else:
# If the list of columns is not specified, use all columns
uniqueness_columns = df.columns

recommendations = []

# # Calculate the duplicates score for each column
# duplicates_scores = []
# # Updated Duplicate Score Calculation
# for col in df.columns:
# duplicates = df.duplicated(subset=col).sum()
# total_count = len(df)
# duplicates_score = duplicates / total_count
# duplicates_scores.append({
# "key": "duplicates_score",
# "value": duplicates_score,
# "scope": {"perimeter": "column", "value": col}
# })

# # Convert the duplicates scores to DataFrame
# duplicates_scores_df = pd.DataFrame(duplicates_scores)
############################ Metrics

# Calculate the total number of duplicate rows in the dataset
total_duplicates = df.duplicated().sum()
@@ -53,20 +46,67 @@
# Invert the score
inverted_duplication_score = 1 - original_duplication_score

# Step 1: Filter the DataFrame based on the specified columns
df_subset = df[uniqueness_columns]

# Step 2: Calculate the number of duplicate rows based on this subset
total_duplicates_subset = df_subset.duplicated().sum()

# Calculate the scoped duplication score
scoped_duplication_score = round(total_duplicates_subset / total_rows if total_rows > 0 else 0, 2)

# Invert the score for the scoped scenario
scoped_score = 1 - scoped_duplication_score

# Use the scoped score if compute_uniqueness_columns is specified in the pack_config
if "job" in pack_config and "compute_uniqueness_columns" in pack_config["job"]:
score = scoped_score
# Add more details to the recommendation if the scoped_score is used
if score < 0.9:
uniqueness_columns_str = ", ".join(uniqueness_columns)
recommendation = {
"content": f"dataset '{source_config['name']}' has a high duplication rate of {scoped_duplication_score*100}% based on the subset of columns: {uniqueness_columns_str}. Consider reviewing these columns for data cleaning.",
"type": "Duplicates",
"scope": {"perimeter": "dataset", "value": source_config["name"]},
"level": utils.determine_recommendation_level(scoped_duplication_score)
}
recommendations.append(recommendation)
else:
score = inverted_duplication_score

# Add the inverted duplication score to the metrics
aggregated_score_entry = {
"key": "score",
"value": inverted_duplication_score,
"value": score,
"scope": {"perimeter": "dataset", "value": source_config["name"]},
}

aggregated_score_df = pd.DataFrame([aggregated_score_entry])

# Create metric entries as DataFrames
total_duplicates_df = pd.DataFrame([{
"key": "duplicates",
"value": total_duplicates,
"scope": {"perimeter": "dataset", "value": source_config["name"]},
}])
# Add the total duplicates entry to the metrics
aggregated_score_df = pd.concat([aggregated_score_df, total_duplicates_df], ignore_index=True)

# Check if scoped score is calculated and add its metrics
if "job" in pack_config and "compute_uniqueness_columns" in pack_config["job"]:
scoped_duplicates_df = pd.DataFrame([{
"key": "duplicates",
"value": total_duplicates_subset,
"scope": {"perimeter": "dataset", "value": ", ".join(uniqueness_columns)},
}])
aggregated_score_df = pd.concat([aggregated_score_df, scoped_duplicates_df], ignore_index=True)

############################ Recommendations

if score < 0.9:
recommendation = {
"content": f"dataset '{source_config['name']}' has a duplication rate of {original_duplication_score*100}%. Consider reviewing for data cleaning.",
"type": "Duplicates",
@@ -75,31 +115,10 @@
}
recommendations.append(recommendation)

# # Generate Recommendations Based on Duplicate Scores
# for score in duplicates_scores:
# column_name = score["scope"]["value"]
# dup_score = score["value"]
# dup_rate_percentage = round(dup_score * 100, 2) # Convert to percentage

# # Only add recommendations for significant duplication rates
# if dup_score > 0.1: # Adjust this threshold as needed
# recommendation = {
# "content": f"Column '{column_name}' has a duplication rate of {dup_rate_percentage}%. Consider reviewing for data cleaning.",
# "type": "Duplicates",
# "scope": {"perimeter": "column", "value": column_name},
# "level": utils.determine_recommendation_level(dup_score)
# }
# recommendations.append(recommendation)

# Convert the recommendations list to a DataFrame
recommendations_df = pd.DataFrame(recommendations)

############################ Schemas

schemas_data = []

# Concatenate all the DataFrames
# metrics = pd.concat([duplicates_scores_df, aggregated_score_df], ignore_index=True)
metrics = pd.concat([aggregated_score_df], ignore_index=True)

# Convert the DataFrames to JSON strings
@@ -115,7 +134,4 @@
json.dump(metrics_data, f, indent=4)

with open("recommendations.json", "w", encoding="utf-8") as f:
json.dump(recommendations_data, f, indent=4)

with open("schemas.json", "w", encoding="utf-8") as f:
json.dump(schemas_data, f, indent=4)
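For context, the scoring in main.py boils down to pandas' DataFrame.duplicated. Below is a minimal, self-contained sketch (using a made-up DataFrame and placeholder column names, not part of the pack) of how the dataset-level score and the column-subset score driven by compute_uniqueness_columns behave:

import pandas as pd

# Hypothetical data, for illustration only
df = pd.DataFrame({
    "id": [1, 2, 3, 3, 4],
    "email": ["a@x.io", "b@x.io", "c@x.io", "c@x.io", "a@x.io"],
    "city": ["Paris", "Lyon", "Nice", "Nice", "Paris"],
})

total_rows = len(df)

# Dataset-level score: a row counts as a duplicate when every column matches an earlier row
total_duplicates = df.duplicated().sum()  # 1 duplicate row here
dataset_score = 1 - round(total_duplicates / total_rows if total_rows > 0 else 0, 2)  # 0.8

# Scoped score: duplicates are detected on a subset of columns only
subset_duplicates = df[["email", "city"]].duplicated().sum()  # 2 duplicate rows on this subset
scoped_score = 1 - round(subset_duplicates / total_rows if total_rows > 0 else 0, 2)  # 0.6

print(dataset_score, scoped_score)

Because the subset check ignores the remaining columns, it flags at least as many rows as the full-row check, so the scoped score is never higher than the dataset-level score.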
9 changes: 9 additions & 0 deletions duplicates_finder_pack/pack_conf.json
@@ -1,11 +1,20 @@
{
"job": {
"compute_uniqueness_columns": []
},
"charts": {
"overview": [
{
"metric_key": "score",
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "duplicates",
"chart_type": "text",
"display_title": true,
"justify": true
}
]
}
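The new job.compute_uniqueness_columns key ships as an empty list; when the key is absent altogether, main.py falls back to checking duplicates across all columns. A hypothetical override (the column names are placeholders, not part of the pack) restricting the uniqueness check to two identifier columns could look like the following, with the charts section left as above:

{
  "job": {
    "compute_uniqueness_columns": ["customer_id", "email"]
  }
}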
2 changes: 1 addition & 1 deletion duplicates_finder_pack/properties.yaml
@@ -3,5 +3,5 @@ icon: icon.png
name: duplicates_finder
type: uniqueness
url: https://github.com/qalita-io/packs/tree/main/duplicates_finder_pack
version: 1.1.0
version: 1.1.4
visibility: public
