Skip to content

Commit

Permalink
fix(PandasDataFrameItem): Use JSON to serialize dataframe instead of …
Browse files Browse the repository at this point in the history
…pickle (#621)

Pandas Dataframes including complex objects (e.g. numpy arrays) were
stored as-is.
They are now serialized using JSON to make them environment-independent.

---

Two native methods are available to serialize a dataframe with a
multi-index, while
keeping the index names:
1. Using table orientation with JSON serializer:
```python
json = dataframe.to_json(orient="table")
dataframe = pandas.read_json(json, orient="table", dtype=False)
```
This method fails when a column name is an integer.
2. Using record orientation with indexes as columns:
```python
dataframe = dataframe.reset_index()
json = dataframe.to_json(orient="records")
dataframe = pandas.read_json(json, orient="records", dtype=False)
```
This method fails when the index has the same name as one of the
columns.
Since neither of those methods works in all cases, we decided to store
the indexes separately.
  • Loading branch information
thomass-dev authored Oct 29, 2024
1 parent 1571501 commit 5f2f229
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 19 deletions.
71 changes: 61 additions & 10 deletions skore/src/skore/item/pandas_dataframe_item.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,12 @@ class PandasDataFrameItem(Item):
creation and update timestamps.
"""

ORIENT = "split"

def __init__(
self,
dataframe_dict: dict,
index_json: str,
dataframe_json: str,
created_at: str | None = None,
updated_at: str | None = None,
):
Expand All @@ -34,23 +37,43 @@ def __init__(
Parameters
----------
dataframe_dict : dict
The dict representation of the dataframe.
index_json : json
The JSON representation of the dataframe's index.
dataframe_json : json
The JSON representation of the dataframe, without its index.
created_at : str
The creation timestamp in ISO format.
updated_at : str
The last update timestamp in ISO format.
"""
super().__init__(created_at, updated_at)

self.dataframe_dict = dataframe_dict
self.index_json = index_json
self.dataframe_json = dataframe_json

@cached_property
def dataframe(self) -> pandas.DataFrame:
"""The pandas DataFrame."""
"""
The pandas DataFrame from the persistence.
Its content can differ from the original dataframe because it has been
serialized using pandas' `to_json` function and not pickled, in order to be
environment-independent.
"""
import io

import pandas

return pandas.DataFrame.from_dict(self.dataframe_dict, orient="tight")
with (
io.StringIO(self.index_json) as index_stream,
io.StringIO(self.dataframe_json) as df_stream,
):
index = pandas.read_json(index_stream, orient=self.ORIENT, dtype=False)
index = index.set_index(list(index.columns))
dataframe = pandas.read_json(df_stream, orient=self.ORIENT, dtype=False)
dataframe.index = index.index

return dataframe

@classmethod
def factory(cls, dataframe: pandas.DataFrame) -> PandasDataFrameItem:
Expand All @@ -66,15 +89,43 @@ def factory(cls, dataframe: pandas.DataFrame) -> PandasDataFrameItem:
-------
PandasDataFrameItem
A new PandasDataFrameItem instance.
Notes
-----
The dataframe must be JSON serializable.
"""
import pandas

if not isinstance(dataframe, pandas.DataFrame):
raise TypeError(f"Type '{dataframe.__class__}' is not supported.")

instance = cls(dataframe_dict=dataframe.to_dict(orient="tight"))

# add dataframe as cached property
instance.dataframe = dataframe
# Two native methods are available to serialize dataframe with multi-index,
# while keeping the index names:
#
# 1. Using table orientation with JSON serializer:
# ```python
# json = dataframe.to_json(orient="table")
# dataframe = pandas.read_json(json, orient="table", dtype=False)
# ```
#
# This method fails when a column name is an integer.
#
# 2. Using record orientation with indexes as columns:
# ```python
# dataframe = dataframe.reset_index()
# json = dataframe.to_json(orient="records")
# dataframe = pandas.read_json(json, orient="records", dtype=False)
# ```
#
# This method fails when the index has the same name as one of the columns.
#
# None of those methods being compatible, we decide to store indexes separately.

index = dataframe.index.to_frame(index=False)
dataframe = dataframe.reset_index(drop=True)
instance = cls(
index_json=index.to_json(orient=PandasDataFrameItem.ORIENT),
dataframe_json=dataframe.to_json(orient=PandasDataFrameItem.ORIENT),
)

return instance
2 changes: 1 addition & 1 deletion skore/src/skore/ui/project_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __serialize_project(project: Project) -> SerializedProject:
value = item.array_list
media_type = "text/markdown"
elif isinstance(item, PandasDataFrameItem):
value = item.dataframe_dict
value = item.dataframe.to_dict(orient="tight")
media_type = "application/vnd.dataframe+json"
elif isinstance(item, PandasSeriesItem):
value = item.series_list
Expand Down
55 changes: 48 additions & 7 deletions skore/tests/unit/item/test_pandas_dataframe_item.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pytest
from pandas import DataFrame
from pandas import DataFrame, Index, MultiIndex
from pandas.testing import assert_frame_equal
from skore.item import PandasDataFrameItem

Expand All @@ -11,23 +12,63 @@ def monkeypatch_datetime(self, monkeypatch, MockDatetime):

@pytest.mark.order(0)
def test_factory(self, mock_nowstr):
dataframe = DataFrame([{"key": "value"}])
dataframe_dict = dataframe.to_dict(orient="tight")
dataframe = DataFrame([{"key": "value"}], Index([0], name="myIndex"))

orient = PandasDataFrameItem.ORIENT
index_json = dataframe.index.to_frame(index=False).to_json(orient=orient)
dataframe_json = dataframe.reset_index(drop=True).to_json(orient=orient)

item = PandasDataFrameItem.factory(dataframe)

assert item.dataframe_dict == dataframe_dict
assert item.index_json == index_json
assert item.dataframe_json == dataframe_json
assert item.created_at == mock_nowstr
assert item.updated_at == mock_nowstr

@pytest.mark.order(1)
def test_dataframe(self, mock_nowstr):
dataframe = DataFrame([{"key": "value"}])
dataframe_dict = dataframe.to_dict(orient="tight")
dataframe = DataFrame([{"key": "value"}], Index([0], name="myIndex"))

orient = PandasDataFrameItem.ORIENT
index_json = dataframe.index.to_frame(index=False).to_json(orient=orient)
dataframe_json = dataframe.reset_index(drop=True).to_json(orient=orient)

item1 = PandasDataFrameItem.factory(dataframe)
item2 = PandasDataFrameItem(
index_json=index_json,
dataframe_json=dataframe_json,
created_at=mock_nowstr,
updated_at=mock_nowstr,
)

assert_frame_equal(item1.dataframe, dataframe)
assert_frame_equal(item2.dataframe, dataframe)

@pytest.mark.order(1)
def test_dataframe_with_complex_object(self, mock_nowstr):
dataframe = DataFrame([{"key": np.array([1])}], Index([0], name="myIndex"))
item = PandasDataFrameItem.factory(dataframe)

assert type(item.dataframe["key"].iloc[0]) is list

@pytest.mark.order(1)
def test_dataframe_with_integer_columns_name_and_multiindex(self, mock_nowstr):
dataframe = DataFrame(
[[">70", "1M", "M", 1], [">70", "2F", "F", 2]],
MultiIndex.from_arrays(
[["france", "usa"], ["paris", "nyc"], ["1", "1"]],
names=("country", "city", "district"),
),
)

orient = PandasDataFrameItem.ORIENT
index_json = dataframe.index.to_frame(index=False).to_json(orient=orient)
dataframe_json = dataframe.reset_index(drop=True).to_json(orient=orient)

item1 = PandasDataFrameItem.factory(dataframe)
item2 = PandasDataFrameItem(
dataframe_dict=dataframe_dict,
index_json=index_json,
dataframe_json=dataframe_json,
created_at=mock_nowstr,
updated_at=mock_nowstr,
)
Expand Down
10 changes: 9 additions & 1 deletion skore/tests/unit/test_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,15 @@ def test_put_dict_item(in_memory_project):


def test_put_pandas_dataframe(in_memory_project):
dataframe = pandas.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
dataframe = pandas.DataFrame(
{
"A": [1, 2, 3],
"B": [4, 5, 6],
"C": [7, 8, 9],
},
index=pandas.Index([0, 1, 2], name="myIndex"),
)

in_memory_project.put("pandas_dataframe", dataframe)
pandas.testing.assert_frame_equal(
in_memory_project.get("pandas_dataframe"), dataframe
Expand Down

0 comments on commit 5f2f229

Please sign in to comment.