Skip to content

Commit

Permalink
Merge pull request #23 from qalita-io/Improve-Compare-Data
Browse files Browse the repository at this point in the history
Improve compare data
  • Loading branch information
armandleopold authored Mar 17, 2024
2 parents 62ecbcf + 7ba3c51 commit d705b2d
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 50 deletions.
43 changes: 30 additions & 13 deletions data_compare_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@


# Checking if the columns exist in the DataFrames
compare_col_list = pack.pack_config["job"]["compare_col_list"]
id_columns = pack.pack_config["job"]["id_columns"]
compare_col_list = pack.pack_config["job"].get("compare_col_list", [])
id_columns = pack.pack_config["job"].get("id_columns", [])
abs_tol = pack.pack_config["job"].get("abs_tol", 0.0001)
rel_tol = pack.pack_config["job"].get("rel_tol", 0)

# Create an intersection of source and target columns if compare_col_list is empty
if not compare_col_list:
if compare_col_list == []:
compare_col_list = list(
set(pack.df_source.columns).intersection(set(pack.df_target.columns))
)
Expand All @@ -37,6 +37,9 @@
# Combine compare_col_list and id_columns while removing duplicates
combined_columns_list = list(dict.fromkeys(compare_col_list + id_columns))

if len(id_columns) == 0:
id_columns = compare_col_list

# Creating subsets for source and target data with no repeated columns
df_source_subset = pack.df_source[combined_columns_list]
df_target_subset = pack.df_target[combined_columns_list]
Expand Down Expand Up @@ -226,19 +229,38 @@
]
)


# Extracting column labels
columnLabels = df_all_mismatch.columns.tolist()

# Converting the DataFrame into the desired format without row labels
# Dictionary to map the old suffix to the new one
suffix_mapping = {"_df1": "_source", "_df2": "_target"}

# Replace a recognized suffix at the end of each column name, applying at most one replacement per column
new_columnLabels = [
(
col
if not any(col.endswith(suffix) for suffix in suffix_mapping.keys())
else next(
col.replace(suffix, replacement)
for suffix, replacement in suffix_mapping.items()
if col.endswith(suffix)
)
)
for col in columnLabels
]

# Rename the columns of df_all_mismatch using the new labels
df_all_mismatch.columns = new_columnLabels

# The conversion to the table format is unchanged; it now picks up the renamed columns
data_formatted = [
[{"value": row[col]} for col in df_all_mismatch.columns]
for index, row in df_all_mismatch.iterrows()
]

# The formatted data structure, now without rowLabels
# The formatted data structure, now with renamed labels
format_structure = {
"columnLabels": columnLabels,
"columnLabels": new_columnLabels, # Use the new column labels
"data": data_formatted,
}

Expand All @@ -247,12 +269,7 @@
[
{
"key": "recommendation_levels_mismatches",
"value": {"info": "<=0.5", "warning": ">0.5", "high": ">0.8"},
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
},
{
"key": "check_column",
"value": [combined_columns_list],
"value": {"info": "0", "warning": "0.5", "high": "0.8"},
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
},
{
Expand Down
97 changes: 63 additions & 34 deletions data_compare_pack/pack_conf.json
Original file line number Diff line number Diff line change
@@ -1,36 +1,65 @@
{
"job": {
"compare_col_list": [],
"id_columns": [],
"abs_tol": 0.0001,
"rel_tol": 0,
"source": {
"skiprows": 0
}
},
"charts": {
"overview": [
{
"metric_key": "score",
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "recommendation_levels",
"chart_type": "recommendation_level_indicator",
"display_title": true
},
{
"metric_key": "check_column",
"chart_type": "check_table",
"display_title": true
},
{
"metric_key": "mismatches_table",
"chart_type": "table",
"display_title": true
}
]
"job": {
"compare_col_list": [],
"id_columns": [],
"abs_tol": 0.0001,
"rel_tol": 0,
"source": {
"skiprows": 0
}
}
},
"charts": {
"overview": [
{
"metric_key": "score",
"chart_type": "text",
"tooltip": {
"title": "Comparison Score [Higher is better]",
"content": "Proportion of data in source that matches the target."
},
"display_title": true,
"justify": true
},
{
"metric_key": "precision",
"chart_type": "text",
"tooltip": {
"title": "Precision [Higher is better]",
"content": "The portion of rows in the target that are correctly represented in the source dataset"
},
"display_title": true,
"justify": true
},
{
"metric_key": "recall",
"chart_type": "text",
"tooltip": {
"title": "Recall [Higher is better]",
"content": "The portion of rows in the source that are correctly represented in the target dataset"
},
"display_title": true,
"justify": true
},
{
"metric_key": "score",
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "recommendation_levels_mismatches",
"chart_type": "recommendation_level_indicator",
"tooltip": {
"title": "Recommendation level's importance mapping",
"content": "Gives the recommendation level for proportions of mismatches"
},
"display_title": true
},
{
"metric_key": "mismatches_table",
"chart_type": "table",
"display_title": true
}
]
}
}
2 changes: 1 addition & 1 deletion data_compare_pack/properties.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ icon: icon.png
name: data_compare
type: consistency
url: https://github.com/qalita-io/packs/tree/main/data_compare_pack
version: 2.0.1
version: 2.0.21
visibility: public
30 changes: 29 additions & 1 deletion outlier_detection_pack/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ def determine_recommendation_level(proportion_outliers):
}
)

pack.metrics.save()

# Define a threshold for considering a data point as an outlier
normality_threshold = pack.pack_config["job"][
Expand Down Expand Up @@ -278,6 +277,35 @@ def determine_recommendation_level(proportion_outliers):
)
all_univariate_outliers = all_univariate_outliers[id_and_other_columns]


# Extracting column labels
columnLabels = all_univariate_outliers.columns.tolist()

# Converting the DataFrame into the desired format without row labels
data_formatted = [
[{"value": row[col]} for col in all_univariate_outliers.columns]
for index, row in all_univariate_outliers.iterrows()
]

# The formatted data structure, now without rowLabels
format_structure = {
"columnLabels": columnLabels,
"data": data_formatted,
}

# Append the univariate outliers table to the metrics
pack.metrics.data.extend(
[
{
"key": "outliers_table",
"value": format_structure,
"scope": {"perimeter": "dataset", "value": pack.source_config["name"]},
},
]
)

pack.metrics.save()

# Step 2: Compile Multivariate Outliers
multivariate_outliers["index"] = (
multivariate_outliers.index
Expand Down
5 changes: 5 additions & 0 deletions outlier_detection_pack/pack_conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@
"chart_type": "text",
"display_title": true,
"justify": true
},
{
"metric_key": "outliers_table",
"chart_type": "table",
"display_title": true
}
],
"scoped": [
Expand Down
2 changes: 1 addition & 1 deletion outlier_detection_pack/properties.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ icon: icon.png
name: outlier_detection
type: reasonability
url: https://github.com/qalita-io/packs/tree/main/outlier_detection_pack
version: 2.0.2
version: 2.0.5
visibility: public

0 comments on commit d705b2d

Please sign in to comment.