Commit
FIX-#7238: Fix docstring inheritance for cached_property and use it (#7239)

Signed-off-by: Anatoly Myachev <[email protected]>
anmyachev authored May 8, 2024
1 parent df81f3a commit 06699a8
Showing 11 changed files with 107 additions and 128 deletions.
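The core of the fix: Modin copies docstrings from pandas onto its own API objects. A plain property exposes its getter as `.fget`, while `functools.cached_property` keeps the wrapped function on `.func`, so a docstring patcher that only special-cases `property` silently skips cached properties. A minimal sketch of the handling required — `inherit_docs` here is an illustrative stand-in, not Modin's actual `_inherit_docstrings` utility:

from functools import cached_property

def inherit_docs(source_cls, target_cls):
    # Illustrative only: copy docstrings from source_cls onto the
    # matching attributes of target_cls.
    for name, attr in list(vars(target_cls).items()):
        parent = getattr(source_cls, name, None)
        if parent is None or parent.__doc__ is None:
            continue
        if isinstance(attr, property):
            # A property's getter hangs off .fget; rebuild the property
            # so help() sees the inherited docstring.
            setattr(target_cls, name,
                    property(attr.fget, attr.fset, attr.fdel, parent.__doc__))
        elif isinstance(attr, cached_property):
            # cached_property keeps the wrapped function on .func and
            # mirrors func.__doc__ on the descriptor itself.
            attr.func.__doc__ = parent.__doc__
            attr.__doc__ = parent.__doc__
        elif callable(attr):
            attr.__doc__ = parent.__doc__

With that in place, the TODO-guarded `@property` workarounds in the files below can finally become cached properties.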
8 changes: 6 additions & 2 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -17,8 +17,12 @@
PandasDataframe is a parent abstract class for any dataframe class
for pandas storage format.
"""

+ from __future__ import annotations

import datetime
import re
+ from functools import cached_property
from typing import TYPE_CHECKING, Callable, Dict, Hashable, List, Optional, Union

import numpy as np
@@ -98,8 +102,8 @@ class PandasDataframe(
_deferred_index = False
_deferred_column = False

- @pandas.util.cache_readonly
- def __constructor__(self):
+ @cached_property
+ def __constructor__(self) -> type[PandasDataframe]:
"""
Create a new instance of this object.
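`__constructor__` depends only on `type(self)`, so it is a natural fit for `functools.cached_property`: the first access runs the method and stores the result in the instance `__dict__`, and every later access is a plain attribute lookup. Roughly (a simplified stand-in class, not the real PandasDataframe):

from __future__ import annotations

from functools import cached_property

class Frame:
    @cached_property
    def __constructor__(self) -> type[Frame]:
        # Evaluated on first access, then served from self.__dict__.
        return type(self)

    def copy(self) -> Frame:
        # Subclasses construct instances of themselves automatically.
        return self.__constructor__()

f = Frame()
assert f.__constructor__ is Frame
assert "__constructor__" in f.__dict__  # cached per instance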
@@ -25,6 +25,9 @@
this is worth looking at again.
"""

+ from __future__ import annotations

+ from functools import cached_property
from typing import Any, Dict, Iterable, Optional, Tuple

import numpy as np
@@ -118,15 +121,8 @@ def size(self) -> int:
def offset(self) -> int:
return 0

- _dtype_cache = None
-
- # TODO: since python 3.9:
- # @cached_property
- @property
+ @cached_property
def dtype(self) -> Tuple[DTypeKind, int, str, str]:
- if self._dtype_cache is not None:
- return self._dtype_cache

dtype = self._col.dtypes.iloc[0]

if isinstance(dtype, pandas.CategoricalDtype):
@@ -149,8 +145,7 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]:
else:
dtype_cache = self._dtype_from_primitive_pandas_dtype(dtype)

- self._dtype_cache = dtype_cache
- return self._dtype_cache
+ return dtype_cache

def _dtype_from_primitive_pandas_dtype(
self, dtype
@@ -228,14 +223,8 @@ def describe_null(self) -> Tuple[int, Any]:

return null, value

- _null_count_cache = None
-
- # TODO: since python 3.9:
- # @cached_property
- @property
+ @cached_property
def null_count(self) -> int:
- if self._null_count_cache is not None:
- return self._null_count_cache

def map_func(df):
return df.isna()
@@ -252,8 +241,7 @@ def reduce_func(df):
# Otherwise, we get mismatching internal and external indices for both axes
intermediate_df.index = pandas.RangeIndex(1)
intermediate_df.columns = pandas.RangeIndex(1)
- self._null_count_cache = intermediate_df.to_pandas().squeeze(axis=1).item()
- return self._null_count_cache
+ return intermediate_df.to_pandas().squeeze(axis=1).item()

@property
def metadata(self) -> Dict[str, Any]:
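Both `dtype` and `null_count` previously hand-rolled the cache with a sentinel attribute. `functools.cached_property` states the same contract in one decorator; if invalidation were ever needed, it is a `del` of the instance attribute. The before/after shape, with the real computation stubbed out:

from functools import cached_property

class Before:
    _null_count_cache = None

    @property
    def null_count(self) -> int:
        if self._null_count_cache is None:
            self._null_count_cache = self._compute()  # expensive reduction
        return self._null_count_cache

    def _compute(self) -> int:
        return 0  # stand-in for the isna/sum map-reduce

class After:
    @cached_property
    def null_count(self) -> int:
        return self._compute()  # runs once, result cached in __dict__

    def _compute(self) -> int:
        return 0  # stand-in for the isna/sum map-reduce

col = After()
col.null_count                        # computes and caches
col.__dict__.pop("null_count", None)  # explicit invalidation, if ever needed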
8 changes: 5 additions & 3 deletions modin/core/dataframe/pandas/partitioning/partition.py
@@ -13,14 +13,16 @@

"""The module defines base interface for a partition of a Modin DataFrame."""

+ from __future__ import annotations

import logging
import uuid
from abc import ABC
from copy import copy
+ from functools import cached_property

import pandas
from pandas.api.types import is_scalar
- from pandas.util import cache_readonly

from modin.core.storage_formats.pandas.utils import length_fn_pandas, width_fn_pandas
from modin.logging import ClassLogger, get_logger
@@ -60,8 +62,8 @@ def __init__(self):
else:
type(self)._iloc_func = staticmethod(self._iloc)

- @cache_readonly
- def __constructor__(self):
+ @cached_property
+ def __constructor__(self) -> type[PandasDataframePartition]:
"""
Create a new instance of this object.
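The observable difference between the two decorators is where the cached value lands: pandas' `cache_readonly` stashes results in a `_cache` dict on the instance, while `functools.cached_property` writes the attribute straight into the instance `__dict__`. A quick check (to the best of my understanding of the pandas descriptor; the values are arbitrary):

from functools import cached_property

import pandas.util

class WithCacheReadonly:
    @pandas.util.cache_readonly
    def value(self):
        return 42

class WithCachedProperty:
    @cached_property
    def value(self):
        return 42

a = WithCacheReadonly()
a.value
print(a._cache)    # {'value': 42} -- results live in a side dict

b = WithCachedProperty()
b.value
print(b.__dict__)  # {'value': 42} -- a plain instance attribute

That storage difference is what lets the `_cache` special case in `DataFrame.__setattr__` be removed in the last file below.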
78 changes: 32 additions & 46 deletions modin/core/io/column_stores/parquet_dispatcher.py
@@ -13,6 +13,9 @@

"""Module houses `ParquetDispatcher` class, that is used for reading `.parquet` files."""

+ from __future__ import annotations

+ import functools
import json
import os
import re
@@ -58,11 +61,6 @@ class ColumnStoreDataset:
dataset : ParquetDataset or ParquetFile
Underlying dataset implementation for PyArrow and fastparquet
respectively.
- _row_groups_per_file : list
- List that contains the number of row groups for each file in the
- given parquet dataset.
- _files : list
- List that contains the full paths of the parquet files in the dataset.
"""

def __init__(self, path, storage_options): # noqa : PR01
@@ -71,8 +69,6 @@ def __init__(self, path, storage_options): # noqa : PR01
self._fs_path = None
self._fs = None
self.dataset = self._init_dataset()
- self._row_groups_per_file = None
- self._files = None

@property
def pandas_metadata(self):
@@ -89,14 +85,12 @@ def engine(self):
"""Return string representing what engine is being used."""
raise NotImplementedError

- # TODO: make this cache_readonly after docstring inheritance is fixed.
- @property
+ @functools.cached_property
def files(self):
"""Return the list of formatted file paths of the dataset."""
raise NotImplementedError

- # TODO: make this cache_readonly after docstring inheritance is fixed.
- @property
+ @functools.cached_property
def row_groups_per_file(self):
"""Return a list with the number of row groups per file."""
raise NotImplementedError
@@ -201,31 +195,27 @@ def columns(self):
def engine(self):
return "pyarrow"

- @property
+ @functools.cached_property
def row_groups_per_file(self):
from pyarrow.parquet import ParquetFile

- if self._row_groups_per_file is None:
- row_groups_per_file = []
- # Count up the total number of row groups across all files and
- # keep track of row groups per file to use later.
- for file in self.files:
- with self.fs.open(file) as f:
- row_groups = ParquetFile(f).num_row_groups
- row_groups_per_file.append(row_groups)
- self._row_groups_per_file = row_groups_per_file
- return self._row_groups_per_file
+ row_groups_per_file = []
+ # Count up the total number of row groups across all files and
+ # keep track of row groups per file to use later.
+ for file in self.files:
+ with self.fs.open(file) as f:
+ row_groups = ParquetFile(f).num_row_groups
+ row_groups_per_file.append(row_groups)
+ return row_groups_per_file

- @property
+ @functools.cached_property
def files(self):
- if self._files is None:
- try:
- files = self.dataset.files
- except AttributeError:
- # compatibility at least with 3.0.0 <= pyarrow < 8.0.0
- files = self.dataset._dataset.files
- self._files = self._get_files(files)
- return self._files
+ try:
+ files = self.dataset.files
+ except AttributeError:
+ # compatibility at least with 3.0.0 <= pyarrow < 8.0.0
+ files = self.dataset._dataset.files
+ return self._get_files(files)

def to_pandas_dataframe(
self,
@@ -259,26 +249,22 @@ def columns(self):
def engine(self):
return "fastparquet"

- @property
+ @functools.cached_property
def row_groups_per_file(self):
from fastparquet import ParquetFile

- if self._row_groups_per_file is None:
- row_groups_per_file = []
- # Count up the total number of row groups across all files and
- # keep track of row groups per file to use later.
- for file in self.files:
- with self.fs.open(file) as f:
- row_groups = ParquetFile(f).info["row_groups"]
- row_groups_per_file.append(row_groups)
- self._row_groups_per_file = row_groups_per_file
- return self._row_groups_per_file
+ row_groups_per_file = []
+ # Count up the total number of row groups across all files and
+ # keep track of row groups per file to use later.
+ for file in self.files:
+ with self.fs.open(file) as f:
+ row_groups = ParquetFile(f).info["row_groups"]
+ row_groups_per_file.append(row_groups)
+ return row_groups_per_file

- @property
+ @functools.cached_property
def files(self):
- if self._files is None:
- self._files = self._get_files(self._get_fastparquet_files())
- return self._files
+ return self._get_files(self._get_fastparquet_files())

def to_pandas_dataframe(self, columns):
return self.dataset.to_pandas(columns=columns)
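In these dataset classes the two cached properties chain: `row_groups_per_file` iterates over `self.files`, so after this change the (possibly remote) filesystem is listed once and each footer is read at most once per dataset object. The shape of the pattern, with the parquet and fsspec details stubbed out:

from __future__ import annotations

import functools

class Dataset:
    @functools.cached_property
    def files(self) -> list[str]:
        print("listing files...")  # imagine a slow S3/HDFS listing
        return ["part-0.parquet", "part-1.parquet"]

    @functools.cached_property
    def row_groups_per_file(self) -> list[int]:
        # Touches self.files exactly once; both results are then cached.
        return [1 for _ in self.files]  # stand-in for reading footers

ds = Dataset()
ds.row_groups_per_file  # prints "listing files..." once
ds.row_groups_per_file  # served from cache, no I/O
ds.files                # already cached by the first access

Note the abstract base also switches its NotImplementedError stubs to `functools.cached_property` — exactly what the removed TODO comments were waiting on, now that docstring inheritance handles cached descriptors.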
7 changes: 5 additions & 2 deletions modin/core/storage_formats/base/query_compiler.py
@@ -17,8 +17,11 @@
``BaseQueryCompiler`` is a parent abstract class for any other query compiler class.
"""

+ from __future__ import annotations

import abc
import warnings
+ from functools import cached_property
from typing import Hashable, List, Optional

import numpy as np
@@ -4455,8 +4458,8 @@ def write_items(df, broadcasted_items):

# END Abstract methods for QueryCompiler

- @pandas.util.cache_readonly
- def __constructor__(self):
+ @cached_property
+ def __constructor__(self) -> type[BaseQueryCompiler]:
"""
Get query compiler constructor.
9 changes: 6 additions & 3 deletions modin/pandas/base.py
@@ -10,12 +10,15 @@
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Implement DataFrame/Series public API as pandas does."""

+ from __future__ import annotations

import pickle as pkl
import re
import warnings
+ from functools import cached_property
from typing import TYPE_CHECKING, Any, Hashable, Literal, Optional, Sequence, Union

import numpy as np
@@ -179,7 +182,7 @@ class BasePandasDataset(ClassLogger):
_pandas_class = pandas.core.generic.NDFrame
_query_compiler: BaseQueryCompiler

- @pandas.util.cache_readonly
+ @cached_property
def _is_dataframe(self) -> bool:
"""
Tell whether this is a dataframe.
@@ -580,8 +583,8 @@ def _get_axis_number(cls, axis):

return cls._pandas_class._get_axis_number(axis) if axis is not None else 0

- @pandas.util.cache_readonly
- def __constructor__(self) -> BasePandasDataset:
+ @cached_property
+ def __constructor__(self) -> type[BasePandasDataset]:
"""
Construct DataFrame or Series object depending on self type.
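Each touched module also gains `from __future__ import annotations` (PEP 563), which keeps annotations as strings at definition time. That matters twice for `-> type[BasePandasDataset]`: subscripting the builtin `type` only works at runtime on Python 3.9+, and the class name is referenced inside its own body before the class object exists. A standalone illustration:

from __future__ import annotations  # PEP 563: annotations stay unevaluated

from functools import cached_property

class Node:
    @cached_property
    def __constructor__(self) -> type[Node]:
        # Without the __future__ import this annotation would be evaluated
        # while the class body runs: NameError (Node is not bound yet) and,
        # on Python 3.8, TypeError for subscripting the builtin type.
        return type(self)

print(Node().__constructor__)  # <class '__main__.Node'>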
3 changes: 1 addition & 2 deletions modin/pandas/dataframe.py
@@ -2615,9 +2615,8 @@ def __setattr__(self, key, value) -> None:
# - `_query_compiler`, which Modin initializes before it appears in
# __dict__
# - `_siblings`, which Modin initializes before it appears in __dict__
- # - `_cache`, which pandas.cache_readonly uses to cache properties
- # before it appears in __dict__.
- if key in ("_query_compiler", "_siblings", "_cache") or key in self.__dict__:
+ if key in ("_query_compiler", "_siblings") or key in self.__dict__:
pass
# we have to check for the key in `dir(self)` first in order not to trigger columns computation
elif key not in dir(self) and key in self:
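The `_cache` carve-out existed because pandas' `cache_readonly` creates its cache dict with an ordinary attribute assignment (`obj._cache = {}`), which flows through `DataFrame.__setattr__` and would otherwise be treated as a column write. `functools.cached_property.__get__` instead assigns into the instance `__dict__` directly, never invoking `__setattr__`, so the guard becomes dead code. A small demonstration of the bypass (a simplified `__setattr__`, not Modin's):

from functools import cached_property

class Guarded:
    def __setattr__(self, key, value):
        print(f"__setattr__ intercepted: {key}")
        super().__setattr__(key, value)

    @cached_property
    def expensive(self) -> int:
        return 99

g = Guarded()
g.expensive   # prints nothing: the cache write goes straight to g.__dict__
g.plain = 1   # prints "__setattr__ intercepted: plain"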