From b0064d531cfa395260fbc78a5048ac87e3c77d8f Mon Sep 17 00:00:00 2001 From: EdenWuyifan Date: Sun, 9 Jun 2024 20:35:31 -0400 Subject: [PATCH] fix graph style and make the graph changes last --- .gitignore | 3 + bdikit/api.py | 16 +++- .../scope_reducing_manager.py | 8 ++ bdikit/utils.py | 11 +++ bdikit/visualization/mappings.py | 2 +- bdikit/visualization/scope_reducing.py | 88 +++++++++++++------ 6 files changed, 99 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index c9160e3a..9937961c 100644 --- a/.gitignore +++ b/.gitignore @@ -74,3 +74,6 @@ nosetests.xml # Model *.pt **/*.pt + +# Scope Reducing Json +examples/*.json \ No newline at end of file diff --git a/bdikit/api.py b/bdikit/api.py index e79556dc..1c06c3a5 100644 --- a/bdikit/api.py +++ b/bdikit/api.py @@ -10,8 +10,10 @@ from bdikit.utils import get_gdc_data from os.path import join, dirname import os +import logging os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable huggingface messages +logger = logging.getLogger(__name__) GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv") @@ -64,7 +66,19 @@ def reduce_scope(self): """ self.scope_manager = ScopeReducingManager(self.dataset, self.global_table) self.reduced_scope = self.scope_manager.reduce() - plot_reduce_scope(self.reduced_scope, self.dataset) + return self.scope_manager.get_heatmap() + + def update_scope(self, reduced_scope=None): + if self.scope_manager is None: + logger.warning("Scope manager not initialized. Please run reduce_scope() first.") + return + + if reduced_scope is None: + self.reduced_scope = self.scope_manager.visualization_manager.reduced_scope + else: + self.reduced_scope = reduced_scope + + return self.reduced_scope def map_columns(self, algorithm="SimFloodAlgorithm"): """ diff --git a/bdikit/mapping_recommendation/scope_reducing_manager.py b/bdikit/mapping_recommendation/scope_reducing_manager.py index 4e398762..3610e515 100644 --- a/bdikit/mapping_recommendation/scope_reducing_manager.py +++ b/bdikit/mapping_recommendation/scope_reducing_manager.py @@ -1,4 +1,5 @@ from bdikit.mapping_algorithms.scope_reducing.algorithms import YurongReducer +from bdikit.visualization.scope_reducing import SRHeatMapManager class ScopeReducingManager: @@ -6,7 +7,14 @@ def __init__(self, dataset, target_domain): self.dataset = dataset self.target_domain = target_domain self.best_method = YurongReducer() + self.visualization_manager = None def reduce(self): reducings = self.best_method.reduce_scope(self.dataset) + self.visualization_manager = SRHeatMapManager(self.dataset, reducings) return reducings + + def get_heatmap(self): + self.visualization_manager.get_heatmap() + return self.visualization_manager.plot_heatmap() + diff --git a/bdikit/utils.py b/bdikit/utils.py index f55c6e1e..bf876e56 100644 --- a/bdikit/utils.py +++ b/bdikit/utils.py @@ -53,3 +53,14 @@ def get_gdc_metadata(): metadata[key] = data return metadata + + +def get_gdc_layered_metadata(): + metadata = {} + gdc_schema = read_gdc_schema() + + for subschema, values in gdc_schema.items(): + for key, data in values["properties"].items(): + metadata[key] = (subschema, data) + + return metadata diff --git a/bdikit/visualization/mappings.py b/bdikit/visualization/mappings.py index eb08540e..b95974a8 100644 --- a/bdikit/visualization/mappings.py +++ b/bdikit/visualization/mappings.py @@ -10,7 +10,7 @@ def plot_reduce_scope(reduced_scope, dataset): scope_explorer = SRHeatMapManager(dataset, reduced_scope) scope_explorer.get_heatmap() - display(scope_explorer.plot_heatmap()) + return scope_explorer.plot_heatmap() def plot_column_mappings(column_mappings): diff --git a/bdikit/visualization/scope_reducing.py b/bdikit/visualization/scope_reducing.py index 7cc1879a..c947a181 100644 --- a/bdikit/visualization/scope_reducing.py +++ b/bdikit/visualization/scope_reducing.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd import panel as pn -from bdikit.utils import get_gdc_metadata, read_gdc_schema +from bdikit.utils import get_gdc_layered_metadata, get_gdc_metadata, read_gdc_schema from Levenshtein import distance from natsort import index_natsorted from sklearn.cluster import AffinityPropagation @@ -18,7 +18,7 @@ def clean_reduced_scope(reduced_scope, max_chars_samples): - gdc_metadata = get_gdc_metadata() + gdc_metadata = get_gdc_layered_metadata() candidates_dfs = {} @@ -26,9 +26,10 @@ def clean_reduced_scope(reduced_scope, max_chars_samples): column_name = column_data["Candidate column"] recommendations = [] for candidate_name, candidate_similarity in column_data["Top k columns"]: - candidate_description = gdc_metadata[candidate_name].get("description", "") + subschema, gdc_data = gdc_metadata[candidate_name] + candidate_description = gdc_data.get("description", "") candidate_description = candidate_description - candidate_values = ", ".join(gdc_metadata[candidate_name].get("enum", [])) + candidate_values = ", ".join(gdc_data.get("enum", [])) candidate_values = truncate_text(candidate_values, max_chars_samples) recommendations.append( ( @@ -36,12 +37,19 @@ def clean_reduced_scope(reduced_scope, max_chars_samples): candidate_similarity, candidate_description, candidate_values, + subschema, ) ) candidates_dfs[column_name] = pd.DataFrame( recommendations, - columns=["Candidate", "Similarity", "Description", "Values (sample)"], + columns=[ + "Candidate", + "Similarity", + "Description", + "Values (sample)", + "Subschema", + ], ) return candidates_dfs @@ -225,6 +233,7 @@ def get_heatmap(self): "Value": c[1], "Description": cadidate_info["Description"].values[0], "Values (sample)": cadidate_info["Values (sample)"].values[0], + "Subschema": cadidate_info["Subschema"].values[0], } ) rec_table.append(col_dict) @@ -368,26 +377,47 @@ def get_clusters(self): clusters[exemplar] = cluster self.clusters = clusters - def _plot_heatmap_base(self, heatmap_rec_list): + def _plot_heatmap_base(self, heatmap_rec_list, show_subschema): single = alt.selection_point(name="single") - base = ( - alt.Chart(heatmap_rec_list) - .mark_rect(size=100) - .encode( - y=alt.X("Column:O", sort=None), - x=alt.X(f"Recommendation:O", sort=None), - color=alt.condition(single, "Value:Q", alt.value("lightgray")), - # color="Value:Q", - tooltip=[ - alt.Tooltip("Column", title="Column"), - alt.Tooltip("Recommendation", title="Recommendation"), - alt.Tooltip("Value", title="Correlation Score"), - alt.Tooltip("Description", title="Description"), - alt.Tooltip("Values (sample)", title="Values (sample)"), - ], + if show_subschema: + base = ( + alt.Chart(heatmap_rec_list) + .mark_rect(size=100) + .encode( + y=alt.X("Column:O", sort=None), + x=alt.X(f"Recommendation:O", sort=None), + color=alt.condition(single, "Value:Q", alt.value("lightgray")), + # color="Value:Q", + tooltip=[ + alt.Tooltip("Column", title="Column"), + alt.Tooltip("Recommendation", title="Recommendation"), + alt.Tooltip("Value", title="Correlation Score"), + alt.Tooltip("Description", title="Description"), + alt.Tooltip("Values (sample)", title="Values (sample)"), + ], + facet=alt.Facet("Subschema:O", columns=1), + ) + .add_params(single) + ) + else: + base = ( + alt.Chart(heatmap_rec_list) + .mark_rect(size=100) + .encode( + y=alt.X("Column:O", sort=None), + x=alt.X(f"Recommendation:O", sort=None), + color=alt.condition(single, "Value:Q", alt.value("lightgray")), + # color="Value:Q", + tooltip=[ + alt.Tooltip("Column", title="Column"), + alt.Tooltip("Recommendation", title="Recommendation"), + alt.Tooltip("Value", title="Correlation Score"), + alt.Tooltip("Description", title="Description"), + alt.Tooltip("Values (sample)", title="Values (sample)"), + ], + ) + .add_params(single) ) - .add_params(single) - ) return pn.pane.Vega(base) def _plot_selected_row(self, heatmap_rec_list, selection): @@ -443,7 +473,6 @@ def _candidates_table(self, heatmap_rec_list, selection): def _plot_column_histogram(self, column): if self.dataset[column].dtype == "float64": - print(column) chart = ( alt.Chart(self.dataset.fillna("Null"), height=300) .mark_bar() @@ -485,6 +514,7 @@ def _plot_pane( subschemas=[], n_similar=0, threshold=0.5, + show_subschema=False, acc_click=0, rej_click=0, ): @@ -522,7 +552,7 @@ def _plot_pane( heatmap_rec_list["Recommendation"].isin(subschema_rec_cols) ] - heatmap_pane = self._plot_heatmap_base(heatmap_rec_list) + heatmap_pane = self._plot_heatmap_base(heatmap_rec_list, show_subschema) cand_table = pn.bind( self._candidates_table, heatmap_rec_list, @@ -557,7 +587,7 @@ def plot_heatmap(self): name="Recommendation subschema", options=self.subschemas, width=220 ) n_similar_slider = pn.widgets.IntSlider( - name="N Similar", start=1, end=5, value=5, width=220 + name="N Similar", start=0, end=5, value=0, width=220 ) thresh_slider = pn.widgets.EditableFloatSlider( name="Threshold", start=0, end=1.0, step=0.01, value=0.1, width=220 @@ -567,6 +597,9 @@ def plot_heatmap(self): rej_button = pn.widgets.Button(name="Decline Match", button_type="danger") + # Style + show_subschema = pn.widgets.Checkbox(name="Show subschema", value=False) + def on_click_accept_match(event): self._accept_match() @@ -582,13 +615,14 @@ def on_click_reject_match(event): select_rec_groups, n_similar_slider, thresh_slider, + show_subschema, acc_button.param.clicks, rej_button.param.clicks, ) column_left = pn.Row( - "# Column", select_column, + show_subschema, select_rec_groups, n_similar_slider, thresh_slider,