Commit
FIX-#7238: Fix docstring inheritance for cached_property and use it (#7239)

Signed-off-by: Anatoly Myachev <[email protected]>
anmyachev authored May 8, 2024
1 parent df81f3a commit 06699a8
Showing 11 changed files with 107 additions and 128 deletions.
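The core of the fix: Modin copies docstrings from pandas onto its own API objects. A plain property exposes its getter as `.fget`, while `functools.cached_property` keeps the wrapped function on `.func`, so a docstring patcher that only special-cases `property` silently skips cached properties. A minimal sketch of the handling required — `inherit_docs` here is an illustrative stand-in, not Modin's actual `_inherit_docstrings` utility:

from functools import cached_property

def inherit_docs(source_cls, target_cls):
    # Illustrative only: copy docstrings from source_cls onto the
    # matching attributes of target_cls.
    for name, attr in list(vars(target_cls).items()):
        parent = getattr(source_cls, name, None)
        if parent is None or parent.__doc__ is None:
            continue
        if isinstance(attr, property):
            # A property's getter hangs off .fget; rebuild the property
            # so help() sees the inherited docstring.
            setattr(target_cls, name,
                    property(attr.fget, attr.fset, attr.fdel, parent.__doc__))
        elif isinstance(attr, cached_property):
            # cached_property keeps the wrapped function on .func and
            # mirrors func.__doc__ on the descriptor itself.
            attr.func.__doc__ = parent.__doc__
            attr.__doc__ = parent.__doc__
        elif callable(attr):
            attr.__doc__ = parent.__doc__

With that in place, the TODO-guarded `@property` workarounds in the files below can finally become cached properties.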
8 changes: 6 additions & 2 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -17,8 +17,12 @@
PandasDataframe is a parent abstract class for any dataframe class
for pandas storage format.
"""

+ from __future__ import annotations

import datetime
import re
+ from functools import cached_property
from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Optional, Union

import numpy as np
@@ -98,8 +102,8 @@ class PandasDataframe(
_deferred_index = False
_deferred_column = False

- @pandas.util.cache_readonly
- def __constructor__(self):
+ @cached_property
+ def __constructor__(self) -> type[PandasDataframe]:
"""
Create a new instance of this object.
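`__constructor__` depends only on `type(self)`, so it is a natural fit for `functools.cached_property`: the first access runs the method and stores the result in the instance `__dict__`, and every later access is a plain attribute lookup. Roughly (a simplified stand-in class, not the real PandasDataframe):

from __future__ import annotations

from functools import cached_property

class Frame:
    @cached_property
    def __constructor__(self) -> type[Frame]:
        # Evaluated on first access, then served from self.__dict__.
        return type(self)

    def copy(self) -> Frame:
        # Subclasses construct instances of themselves automatically.
        return self.__constructor__()

f = Frame()
assert f.__constructor__ is Frame
assert "__constructor__" in f.__dict__  # cached per instance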
@@ -25,6 +25,9 @@
this is worth looking at again.
"""

+ from __future__ import annotations

+ from functools import cached_property
from typing import Any, Dict, Iterable, Optional, Tuple

import numpy as np
@@ -118,15 +121,8 @@ def size(self) -> int:
def offset(self) -> int:
return 0

- _dtype_cache = None
-
- # TODO: since python 3.9:
- # @cached_property
- @property
+ @cached_property
def dtype(self) -> Tuple[DTypeKind, int, str, str]:
- if self._dtype_cache is not None:
- return self._dtype_cache

dtype = self._col.dtypes.iloc[0]

if isinstance(dtype, pandas.CategoricalDtype):
@@ -149,8 +145,7 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]:
else:
dtype_cache = self._dtype_from_primitive_pandas_dtype(dtype)

- self._dtype_cache = dtype_cache
- return self._dtype_cache
+ return dtype_cache

def _dtype_from_primitive_pandas_dtype(
self, dtype
@@ -228,14 +223,8 @@ def describe_null(self) -> Tuple[int, Any]:

return null, value

- _null_count_cache = None
-
- # TODO: since python 3.9:
- # @cached_property
- @property
+ @cached_property
def null_count(self) -> int:
- if self._null_count_cache is not None:
- return self._null_count_cache

def map_func(df):
return df.isna()
@@ -252,8 +241,7 @@ def reduce_func(df):
# Otherwise, we get mismatching internal and external indices for both axes
intermediate_df.index = pandas.RangeIndex(1)
intermediate_df.columns = pandas.RangeIndex(1)
- self._null_count_cache = intermediate_df.to_pandas().squeeze(axis=1).item()
- return self._null_count_cache
+ return intermediate_df.to_pandas().squeeze(axis=1).item()

@property
def metadata(self) -> Dict[str, Any]:
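Both `dtype` and `null_count` previously hand-rolled the cache with a sentinel attribute. `functools.cached_property` states the same contract in one decorator; if invalidation were ever needed, it is a `del` of the instance attribute. The before/after shape, with the real computation stubbed out:

from functools import cached_property

class Before:
    _null_count_cache = None

    @property
    def null_count(self) -> int:
        if self._null_count_cache is None:
            self._null_count_cache = self._compute()  # expensive reduction
        return self._null_count_cache

    def _compute(self) -> int:
        return 0  # stand-in for the isna/sum map-reduce

class After:
    @cached_property
    def null_count(self) -> int:
        return self._compute()  # runs once, result cached in __dict__

    def _compute(self) -> int:
        return 0  # stand-in for the isna/sum map-reduce

col = After()
col.null_count                        # computes and caches
col.__dict__.pop("null_count", None)  # explicit invalidation, if ever needed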
8 changes: 5 additions & 3 deletions modin/core/dataframe/pandas/partitioning/partition.py
@@ -13,14 +13,16 @@

"""The module defines base interface for a partition of a Modin DataFrame."""

+ from __future__ import annotations

import logging
import uuid
from abc import ABC
from copy import copy
+ from functools import cached_property

import pandas
from pandas.api.types import is_scalar
- from pandas.util import cache_readonly

from modin.core.storage_formats.pandas.utils import length_fn_pandas, width_fn_pandas
from modin.logging import ClassLogger, get_logger
@@ -60,8 +62,8 @@ def __init__(self):
else:
type(self)._iloc_func = staticmethod(self._iloc)

- @cache_readonly
- def __constructor__(self):
+ @cached_property
+ def __constructor__(self) -> type[PandasDataframePartition]:
"""
Create a new instance of this object.
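The observable difference between the two decorators is where the cached value lands: pandas' `cache_readonly` stashes results in a `_cache` dict on the instance, while `functools.cached_property` writes the attribute straight into the instance `__dict__`. A quick check (to the best of my understanding of the pandas descriptor; the values are arbitrary):

from functools import cached_property

import pandas.util

class WithCacheReadonly:
    @pandas.util.cache_readonly
    def value(self):
        return 42

class WithCachedProperty:
    @cached_property
    def value(self):
        return 42

a = WithCacheReadonly()
a.value
print(a._cache)    # {'value': 42} -- results live in a side dict

b = WithCachedProperty()
b.value
print(b.__dict__)  # {'value': 42} -- a plain instance attribute

That storage difference is what lets the `_cache` special case in `DataFrame.__setattr__` be removed in the last file below.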
78 changes: 32 additions & 46 deletions modin/core/io/column_stores/parquet_dispatcher.py
@@ -13,6 +13,9 @@

"""Module houses `ParquetDispatcher` class, that is used for reading `.parquet` files."""

+ from __future__ import annotations

+ import functools
import json
import os
import re
@@ -58,11 +61,6 @@ class ColumnStoreDataset:
dataset : ParquetDataset or ParquetFile
Underlying dataset implementation for PyArrow and fastparquet
respectively.
- _row_groups_per_file : list
- List that contains the number of row groups for each file in the
- given parquet dataset.
- _files : list
- List that contains the full paths of the parquet files in the dataset.
"""

def __init__(self, path, storage_options): # noqa : PR01
@@ -71,8 +69,6 @@ def __init__(self, path, storage_options): # noqa : PR01
self._fs_path = None
self._fs = None
self.dataset = self._init_dataset()
- self._row_groups_per_file = None
- self._files = None

@property
def pandas_metadata(self):
@@ -89,14 +85,12 @@ def engine(self):
"""Return string representing what engine is being used."""
raise NotImplementedError

- # TODO: make this cache_readonly after docstring inheritance is fixed.
- @property
+ @functools.cached_property
def files(self):
"""Return the list of formatted file paths of the dataset."""
raise NotImplementedError

- # TODO: make this cache_readonly after docstring inheritance is fixed.
- @property
+ @functools.cached_property
def row_groups_per_file(self):
"""Return a list with the number of row groups per file."""
raise NotImplementedError
@@ -201,31 +195,27 @@ def columns(self):
def engine(self):
return "pyarrow"

- @property
+ @functools.cached_property
def row_groups_per_file(self):
from pyarrow.parquet import ParquetFile

- if self._row_groups_per_file is None:
- row_groups_per_file = []
- # Count up the total number of row groups across all files and
- # keep track of row groups per file to use later.
- for file in self.files:
- with self.fs.open(file) as f:
- row_groups = ParquetFile(f).num_row_groups
- row_groups_per_file.append(row_groups)
- self._row_groups_per_file = row_groups_per_file
- return self._row_groups_per_file
+ row_groups_per_file = []
+ # Count up the total number of row groups across all files and
+ # keep track of row groups per file to use later.
+ for file in self.files:
+ with self.fs.open(file) as f:
+ row_groups = ParquetFile(f).num_row_groups
+ row_groups_per_file.append(row_groups)
+ return row_groups_per_file

- @property
+ @functools.cached_property
def files(self):
- if self._files is None:
- try:
- files = self.dataset.files
- except AttributeError:
- # compatibility at least with 3.0.0 <= pyarrow < 8.0.0
- files = self.dataset._dataset.files
- self._files = self._get_files(files)
- return self._files
+ try:
+ files = self.dataset.files
+ except AttributeError:
+ # compatibility at least with 3.0.0 <= pyarrow < 8.0.0
+ files = self.dataset._dataset.files
+ return self._get_files(files)

def to_pandas_dataframe(
self,
@@ -259,26 +249,22 @@ def columns(self):
def engine(self):
return "fastparquet"

- @property
+ @functools.cached_property
def row_groups_per_file(self):
from fastparquet import ParquetFile

- if self._row_groups_per_file is None:
- row_groups_per_file = []
- # Count up the total number of row groups across all files and
- # keep track of row groups per file to use later.
- for file in self.files:
- with self.fs.open(file) as f:
- row_groups = ParquetFile(f).info["row_groups"]
- row_groups_per_file.append(row_groups)
- self._row_groups_per_file = row_groups_per_file
- return self._row_groups_per_file
+ row_groups_per_file = []
+ # Count up the total number of row groups across all files and
+ # keep track of row groups per file to use later.
+ for file in self.files:
+ with self.fs.open(file) as f:
+ row_groups = ParquetFile(f).info["row_groups"]
+ row_groups_per_file.append(row_groups)
+ return row_groups_per_file

- @property
+ @functools.cached_property
def files(self):
- if self._files is None:
- self._files = self._get_files(self._get_fastparquet_files())
- return self._files
+ return self._get_files(self._get_fastparquet_files())

def to_pandas_dataframe(self, columns):
return self.dataset.to_pandas(columns=columns)
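In these dataset classes the two cached properties chain: `row_groups_per_file` iterates over `self.files`, so after this change the (possibly remote) filesystem is listed once and each footer is read at most once per dataset object. The shape of the pattern, with the parquet and fsspec details stubbed out:

from __future__ import annotations

import functools

class Dataset:
    @functools.cached_property
    def files(self) -> list[str]:
        print("listing files...")  # imagine a slow S3/HDFS listing
        return ["part-0.parquet", "part-1.parquet"]

    @functools.cached_property
    def row_groups_per_file(self) -> list[int]:
        # Touches self.files exactly once; both results are then cached.
        return [1 for _ in self.files]  # stand-in for reading footers

ds = Dataset()
ds.row_groups_per_file  # prints "listing files..." once
ds.row_groups_per_file  # served from cache, no I/O
ds.files                # already cached by the first access

Note the abstract base also switches its NotImplementedError stubs to `functools.cached_property` — exactly what the removed TODO comments were waiting on, now that docstring inheritance handles cached descriptors.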
7 changes: 5 additions & 2 deletions modin/core/storage_formats/base/query_compiler.py
@@ -17,8 +17,11 @@
``BaseQueryCompiler`` is a parent abstract class for any other query compiler class.
"""

+ from __future__ import annotations

import abc
import warnings
+ from functools import cached_property
from typing import Hashable, List, Optional

import numpy as np
@@ -4455,8 +4458,8 @@ def write_items(df, broadcasted_items):

# END Abstract methods for QueryCompiler

- @pandas.util.cache_readonly
- def __constructor__(self):
+ @cached_property
+ def __constructor__(self) -> type[BaseQueryCompiler]:
"""
Get query compiler constructor.
9 changes: 6 additions & 3 deletions modin/pandas/base.py
@@ -10,12 +10,15 @@
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Implement DataFrame/Series public API as pandas does."""

+ from __future__ import annotations

import pickle as pkl
import re
import warnings
+ from functools import cached_property
from typing import TYPE_CHECKING, Any, Hashable, Literal, Optional, Sequence, Union

import numpy as np
@@ -179,7 +182,7 @@ class BasePandasDataset(ClassLogger):
_pandas_class = pandas.core.generic.NDFrame
_query_compiler: BaseQueryCompiler

- @pandas.util.cache_readonly
+ @cached_property
def _is_dataframe(self) -> bool:
"""
Tell whether this is a dataframe.
@@ -580,8 +583,8 @@ def _get_axis_number(cls, axis):

return cls._pandas_class._get_axis_number(axis) if axis is not None else 0

- @pandas.util.cache_readonly
- def __constructor__(self) -> BasePandasDataset:
+ @cached_property
+ def __constructor__(self) -> type[BasePandasDataset]:
"""
Construct DataFrame or Series object depending on self type.
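Each touched module also gains `from __future__ import annotations` (PEP 563), which keeps annotations as strings at definition time. That matters twice for `-> type[BasePandasDataset]`: subscripting the builtin `type` only works at runtime on Python 3.9+, and the class name is referenced inside its own body before the class object exists. A standalone illustration:

from __future__ import annotations  # PEP 563: annotations stay unevaluated

from functools import cached_property

class Node:
    @cached_property
    def __constructor__(self) -> type[Node]:
        # Without the __future__ import this annotation would be evaluated
        # while the class body runs: NameError (Node is not bound yet) and,
        # on Python 3.8, TypeError for subscripting the builtin type.
        return type(self)

print(Node().__constructor__)  # <class '__main__.Node'>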
3 changes: 1 addition & 2 deletions modin/pandas/dataframe.py
@@ -2615,9 +2615,8 @@ def __setattr__(self, key, value) -> None:
# - `_query_compiler`, which Modin initializes before it appears in
# __dict__
# - `_siblings`, which Modin initializes before it appears in __dict__
- # - `_cache`, which pandas.cache_readonly uses to cache properties
- # before it appears in __dict__.
- if key in ("_query_compiler", "_siblings", "_cache") or key in self.__dict__:
+ if key in ("_query_compiler", "_siblings") or key in self.__dict__:
pass
# we have to check for the key in `dir(self)` first in order not to trigger columns computation
elif key not in dir(self) and key in self:
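The `_cache` carve-out existed because pandas' `cache_readonly` creates its cache dict with an ordinary attribute assignment (`obj._cache = {}`), which flows through `DataFrame.__setattr__` and would otherwise be treated as a column write. `functools.cached_property.__get__` instead assigns into the instance `__dict__` directly, never invoking `__setattr__`, so the guard becomes dead code. A small demonstration of the bypass (a simplified `__setattr__`, not Modin's):

from functools import cached_property

class Guarded:
    def __setattr__(self, key, value):
        print(f"__setattr__ intercepted: {key}")
        super().__setattr__(key, value)

    @cached_property
    def expensive(self) -> int:
        return 99

g = Guarded()
g.expensive   # prints nothing: the cache write goes straight to g.__dict__
g.plain = 1   # prints "__setattr__ intercepted: plain"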