Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TypeError in TableReport when dataframe contains unhashable values #1066

Closed
jeromedockes opened this issue Sep 10, 2024 · 0 comments · Fixed by #1087
Closed

TypeError in TableReport when dataframe contains unhashable values #1066

jeromedockes opened this issue Sep 10, 2024 · 0 comments · Fixed by #1087
Labels
bug Something isn't working

Comments

@jeromedockes
Copy link
Member

jeromedockes commented Sep 10, 2024

Describe the bug

At some point, the table report builds a dictionary `{value: count}` for non-numeric values. This raises a `TypeError` when a value is not hashable, e.g. a Python list stored in a pandas Series with object dtype.

reported by @Vincent-Maladiere here

Steps/Code to Reproduce

import pandas as pd
from skrub import TableReport

TableReport(
    pd.DataFrame(dict(a=[[1]]))
)

Expected Results

no error

Actual Results

	"name": "TypeError",
	"message": "unhashable type: 'list'",
	"stack": "---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~/dev/inria/skrub/.venv/lib/python3.10/site-packages/IPython/core/formatters.py:347, in BaseFormatter.__call__(self, obj)
    345     method = get_real_method(obj, self.print_method)
    346     if method is not None:
--> 347         return method()
    348     return None
    349 else:

File ~/dev/inria/skrub/skrub/_reporting/_table_report.py:167, in TableReport._repr_html_(self)
    166 def _repr_html_(self):
--> 167     return self._repr_mimebundle_()[\"text/html\"]

File ~/dev/inria/skrub/skrub/_reporting/_table_report.py:164, in TableReport._repr_mimebundle_(***failed resolving arguments***)
    162 def _repr_mimebundle_(self, include=None, exclude=None):
    163     del include, exclude
--> 164     return {\"text/html\": self.html_snippet()}

File ~/dev/inria/skrub/skrub/_reporting/_table_report.py:143, in TableReport.html_snippet(self)
    134 def html_snippet(self):
    135     \"\"\"Get the report as an HTML fragment that can be inserted in a page.
    136 
    137     Returns
   (...)
    140         The HTML snippet.
    141     \"\"\"
    142     return to_html(
--> 143         self._summary_with_plots,
    144         standalone=False,
    145         column_filters=self.column_filters,
    146     )

File ~/miniforge3/lib/python3.10/functools.py:981, in cached_property.__get__(self, instance, owner)
    979 val = cache.get(self.attrname, _NOT_FOUND)
    980 if val is _NOT_FOUND:
--> 981     val = self.func(instance)
    982     try:
    983         cache[self.attrname] = val

File ~/dev/inria/skrub/skrub/_reporting/_table_report.py:104, in TableReport._summary_with_plots(self)
    102 @functools.cached_property
    103 def _summary_with_plots(self):
--> 104     return summarize_dataframe(
    105         self.dataframe, with_plots=True, title=self.title, **self._summary_kwargs
    106     )

File ~/dev/inria/skrub/skrub/_reporting/_summarize.py:64, in summarize_dataframe(df, order_by, with_plots, title)
     59 for position, column_name in enumerate(sbd.column_names(df)):
     60     print(
     61         f\"Processing column {position + 1: >3} / {n_columns}\", end=\"\\r\", flush=True
     62     )
     63     summary[\"columns\"].append(
---> 64         _summarize_column(
     65             sbd.col(df, column_name),
     66             position,
     67             dataframe_summary=summary,
     68             with_plots=with_plots,
     69             order_by_column=None if order_by is None else sbd.col(df, order_by),
     70         )
     71     )
     72 print(flush=True)
     73 summary[\"n_constant_columns\"] = sum(
     74     c[\"value_is_constant\"] for c in summary[\"columns\"]
     75 )

File ~/dev/inria/skrub/skrub/_reporting/_summarize.py:104, in _summarize_column(column, position, dataframe_summary, with_plots, order_by_column)
    102     summary[\"plot_names\"] = []
    103     return summary
--> 104 _add_value_counts(
    105     summary, column, dataframe_summary=dataframe_summary, with_plots=with_plots
    106 )
    107 _add_numeric_summary(
    108     summary,
    109     column,
   (...)
    112     order_by_column=order_by_column,
    113 )
    114 _add_datetime_summary(summary, column, with_plots=with_plots)

File ~/dev/inria/skrub/skrub/_reporting/_summarize.py:136, in _add_value_counts(summary, column, dataframe_summary, with_plots)
    134     summary[\"high_cardinality\"] = True
    135     return
--> 136 n_unique, value_counts = _utils.top_k_value_counts(column, k=10)
    137 # if the column contains all nulls, _add_value_counts does not get called
    138 assert n_unique > 0

File ~/dev/inria/skrub/skrub/_reporting/_utils.py:48, in top_k_value_counts(column, k)
     46 counts = sbd.sort(counts, by=\"count\", descending=True)
     47 counts = sbd.slice(counts, k)
---> 48 return n_unique, dict(zip(*to_dict(counts).values()))

TypeError: unhashable type: 'list'"

Versions

all
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

Successfully merging a pull request may close this issue.

1 participant