From 03e719b18f1b45becb5042c0f95adeecb5cd8043 Mon Sep 17 00:00:00 2001 From: mski_iksm <34547057+mski-iksm@users.noreply.github.com> Date: Thu, 18 Jul 2024 10:49:10 +0900 Subject: [PATCH] fix load_dill_with_pandas_backward_compatibility() bug (#382) * seek(0) * fix FileLike * add seekable check * change order * check seekable * add assert seekable * add seekable * add comments --- gokart/file_processor.py | 6 ++++-- gokart/utils.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/gokart/file_processor.py b/gokart/file_processor.py index 707b3112..21b8b77f 100644 --- a/gokart/file_processor.py +++ b/gokart/file_processor.py @@ -82,8 +82,10 @@ def format(self): return luigi.format.Nop def load(self, file): - if not ObjectStorage.is_buffered_reader(file): - # we cannot use dill.load(file) because ReadableS3File does not have 'readline' method + if not file.seekable(): + # load_dill_with_pandas_backward_compatibility() requires file with seek() and readlines() implemented. + # Therefore, we need to wrap with BytesIO which makes file seekable and readlinesable. + # For example, ReadableS3File is not a seekable file. return load_dill_with_pandas_backward_compatibility(BytesIO(file.read())) return load_dill_with_pandas_backward_compatibility(_ChunkedLargeFileReader(file)) diff --git a/gokart/utils.py b/gokart/utils.py index 0d6d6617..29d89181 100644 --- a/gokart/utils.py +++ b/gokart/utils.py @@ -14,6 +14,10 @@ def read(self, n: int) -> bytes: ... def readline(self) -> bytes: ... + def seek(self, offset: int) -> None: ... + + def seekable(self) -> bool: ... + def add_config(file_path: str): _, ext = os.path.splitext(file_path) @@ -27,8 +31,6 @@ def add_config(file_path: str): FlattenableItems: TypeAlias = T | Iterable['FlattenableItems[T]'] | dict[str, 'FlattenableItems[T]'] else: - from typing import Union - FlattenableItems = Union[T, Iterable['FlattenableItems[T]'], dict[str, 'FlattenableItems[T]']] @@ -74,6 +76,8 @@ def load_dill_with_pandas_backward_compatibility(file: FileLike) -> Any: It is unclear whether all objects dumped by dill can be loaded by pd.read_pickle, we use dill.load as a fallback. """ try: - return pd.read_pickle(file) - except Exception: return dill.load(file) + except Exception: + assert file.seekable(), f'{file} is not seekable.' + file.seek(0) + return pd.read_pickle(file)