fix(PandasDataFrameItem): Use JSON to serialize dataframe instead of …

…pickle (#621)

Pandas DataFrames including complex objects (e.g. numpy arrays) were stored as-is. They are now serialized using JSON to make them environment-independent.

---

Two native methods are available to serialize a dataframe with a multi-index while keeping the index names:

1. Using table orientation with the JSON serializer:

   ```python
   json = dataframe.to_json(orient="table")
   dataframe = pandas.read_json(json, orient="table", dtype=False)
   ```

   This method fails when a column name is an integer.

2. Using record orientation with indexes as columns:

   ```python
   dataframe = dataframe.reset_index()
   json = dataframe.to_json(orient="records")
   dataframe = pandas.read_json(json, orient="records", dtype=False)
   ```

   This method fails when the index has the same name as one of the columns.

Since neither method works in every case, we decided to store the indexes separately.
probabl-ai · Oct 29, 2024 · 5f2f229 · 5f2f229
1 parent 1571501
commit 5f2f229
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 19 deletions.
diff --git a/skore/src/skore/item/pandas_dataframe_item.py b/skore/src/skore/item/pandas_dataframe_item.py
@@ -23,9 +23,12 @@ class PandasDataFrameItem(Item):
     creation and update timestamps.
     """
 
+    ORIENT = "split"
+
     def __init__(
         self,
-        dataframe_dict: dict,
+        index_json: str,
+        dataframe_json: str,
         created_at: str | None = None,
         updated_at: str | None = None,
     ):
@@ -34,23 +37,43 @@ def __init__(
 
         Parameters
         ----------
-        dataframe_dict : dict
-            The dict representation of the dataframe.
+        index_json : json
+            The JSON representation of the dataframe's index.
+        dataframe_json : json
+            The JSON representation of the dataframe, without its index.
         created_at : str
             The creation timestamp in ISO format.
         updated_at : str
             The last update timestamp in ISO format.
         """
         super().__init__(created_at, updated_at)
 
-        self.dataframe_dict = dataframe_dict
+        self.index_json = index_json
+        self.dataframe_json = dataframe_json
 
     @cached_property
     def dataframe(self) -> pandas.DataFrame:
-        """The pandas DataFrame."""
+        """
+        The pandas DataFrame from the persistence.
+
+        Its content can differ from the original dataframe because it has been
+        serialized using pandas' `to_json` function and not pickled, in order to be
+        environment-independent.
+        """
+        import io
+
         import pandas
 
-        return pandas.DataFrame.from_dict(self.dataframe_dict, orient="tight")
+        with (
+            io.StringIO(self.index_json) as index_stream,
+            io.StringIO(self.dataframe_json) as df_stream,
+        ):
+            index = pandas.read_json(index_stream, orient=self.ORIENT, dtype=False)
+            index = index.set_index(list(index.columns))
+            dataframe = pandas.read_json(df_stream, orient=self.ORIENT, dtype=False)
+            dataframe.index = index.index
+
+            return dataframe
 
     @classmethod
     def factory(cls, dataframe: pandas.DataFrame) -> PandasDataFrameItem:
@@ -66,15 +89,43 @@ def factory(cls, dataframe: pandas.DataFrame) -> PandasDataFrameItem:
         -------
         PandasDataFrameItem
             A new PandasDataFrameItem instance.
+
+        Notes
+        -----
+        The dataframe must be JSON serializable.
         """
         import pandas
 
         if not isinstance(dataframe, pandas.DataFrame):
             raise TypeError(f"Type '{dataframe.__class__}' is not supported.")
 
-        instance = cls(dataframe_dict=dataframe.to_dict(orient="tight"))
-
-        # add dataframe as cached property
-        instance.dataframe = dataframe
+        # Two native methods are available to serialize dataframe with multi-index,
+        # while keeping the index names:
+        #
+        # 1. Using table orientation with JSON serializer:
+        #    ```python
+        #    json = dataframe.to_json(orient="table")
+        #    dataframe = pandas.read_json(json, orient="table", dtype=False)
+        #    ```
+        #
+        #    This method fails when a column name is an integer.
+        #
+        # 2. Using record orientation with indexes as columns:
+        #    ```python
+        #    dataframe = dataframe.reset_index()
+        #    json = dataframe.to_json(orient="records")
+        #    dataframe = pandas.read_json(json, orient="records", dtype=False)
+        #    ```
+        #
+        #    This method fails when the index has the same name as one of the columns.
+        #
+        # None of those methods being compatible, we decide to store indexes separately.
+
+        index = dataframe.index.to_frame(index=False)
+        dataframe = dataframe.reset_index(drop=True)
+        instance = cls(
+            index_json=index.to_json(orient=PandasDataFrameItem.ORIENT),
+            dataframe_json=dataframe.to_json(orient=PandasDataFrameItem.ORIENT),
+        )
 
         return instance
diff --git a/skore/src/skore/ui/project_routes.py b/skore/src/skore/ui/project_routes.py
@@ -54,7 +54,7 @@ def __serialize_project(project: Project) -> SerializedProject:
             value = item.array_list
             media_type = "text/markdown"
         elif isinstance(item, PandasDataFrameItem):
-            value = item.dataframe_dict
+            value = item.dataframe.to_dict(orient="tight")
             media_type = "application/vnd.dataframe+json"
         elif isinstance(item, PandasSeriesItem):
             value = item.series_list

diff --git a/skore/tests/unit/item/test_pandas_dataframe_item.py b/skore/tests/unit/item/test_pandas_dataframe_item.py
@@ -1,5 +1,6 @@
+import numpy as np
 import pytest
-from pandas import DataFrame
+from pandas import DataFrame, Index, MultiIndex
 from pandas.testing import assert_frame_equal
 from skore.item import PandasDataFrameItem
 
@@ -11,23 +12,63 @@ def monkeypatch_datetime(self, monkeypatch, MockDatetime):
 
     @pytest.mark.order(0)
     def test_factory(self, mock_nowstr):
-        dataframe = DataFrame([{"key": "value"}])
-        dataframe_dict = dataframe.to_dict(orient="tight")
+        dataframe = DataFrame([{"key": "value"}], Index([0], name="myIndex"))
+
+        orient = PandasDataFrameItem.ORIENT
+        index_json = dataframe.index.to_frame(index=False).to_json(orient=orient)
+        dataframe_json = dataframe.reset_index(drop=True).to_json(orient=orient)
 
         item = PandasDataFrameItem.factory(dataframe)
 
-        assert item.dataframe_dict == dataframe_dict
+        assert item.index_json == index_json
+        assert item.dataframe_json == dataframe_json
         assert item.created_at == mock_nowstr
         assert item.updated_at == mock_nowstr
 
     @pytest.mark.order(1)
     def test_dataframe(self, mock_nowstr):
-        dataframe = DataFrame([{"key": "value"}])
-        dataframe_dict = dataframe.to_dict(orient="tight")
+        dataframe = DataFrame([{"key": "value"}], Index([0], name="myIndex"))
+
+        orient = PandasDataFrameItem.ORIENT
+        index_json = dataframe.index.to_frame(index=False).to_json(orient=orient)
+        dataframe_json = dataframe.reset_index(drop=True).to_json(orient=orient)
+
+        item1 = PandasDataFrameItem.factory(dataframe)
+        item2 = PandasDataFrameItem(
+            index_json=index_json,
+            dataframe_json=dataframe_json,
+            created_at=mock_nowstr,
+            updated_at=mock_nowstr,
+        )
+
+        assert_frame_equal(item1.dataframe, dataframe)
+        assert_frame_equal(item2.dataframe, dataframe)
+
+    @pytest.mark.order(1)
+    def test_dataframe_with_complex_object(self, mock_nowstr):
+        dataframe = DataFrame([{"key": np.array([1])}], Index([0], name="myIndex"))
+        item = PandasDataFrameItem.factory(dataframe)
+
+        assert type(item.dataframe["key"].iloc[0]) is list
+
+    @pytest.mark.order(1)
+    def test_dataframe_with_integer_columns_name_and_multiindex(self, mock_nowstr):
+        dataframe = DataFrame(
+            [[">70", "1M", "M", 1], [">70", "2F", "F", 2]],
+            MultiIndex.from_arrays(
+                [["france", "usa"], ["paris", "nyc"], ["1", "1"]],
+                names=("country", "city", "district"),
+            ),
+        )
+
+        orient = PandasDataFrameItem.ORIENT
+        index_json = dataframe.index.to_frame(index=False).to_json(orient=orient)
+        dataframe_json = dataframe.reset_index(drop=True).to_json(orient=orient)
 
         item1 = PandasDataFrameItem.factory(dataframe)
         item2 = PandasDataFrameItem(
-            dataframe_dict=dataframe_dict,
+            index_json=index_json,
+            dataframe_json=dataframe_json,
             created_at=mock_nowstr,
             updated_at=mock_nowstr,
         )

diff --git a/skore/tests/unit/test_project.py b/skore/tests/unit/test_project.py
@@ -45,7 +45,15 @@ def test_put_dict_item(in_memory_project):
 
 
 def test_put_pandas_dataframe(in_memory_project):
-    dataframe = pandas.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+    dataframe = pandas.DataFrame(
+        {
+            "A": [1, 2, 3],
+            "B": [4, 5, 6],
+            "C": [7, 8, 9],
+        },
+        index=pandas.Index([0, 1, 2], name="myIndex"),
+    )
+
     in_memory_project.put("pandas_dataframe", dataframe)
     pandas.testing.assert_frame_equal(
         in_memory_project.get("pandas_dataframe"), dataframe