Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REFACTOR-#5024: Make _row_lengths and _column_widths public #5025

Merged
merged 1 commit into from
Sep 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.16.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ Key Features and Updates
* REFACTOR-#4970: Rewrite base implementations of a partition's `width/length` (#4971)
* REFACTOR-#4942: Remove `call` method in favor of `register` due to duplication (#4943)
* REFACTOR-#4922: Helpers for take_2d_labels_or_positional (#4865)
* REFACTOR-#5024: Make `_row_lengths` and `_column_widths` public (#5025)
* REFACTOR-#5009: Use `RayWrapper.materialize` instead of `ray.get` (#5010)
* REFACTOR-#4755: Rewrite Pandas version mismatch warning (#4965)
* Pandas API implementations and improvements
Expand Down
82 changes: 40 additions & 42 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def _validate_axes_lengths(self):
)

@property
def _row_lengths(self):
def row_lengths(self):
"""
Compute the row partitions lengths if they are not cached.

Expand All @@ -261,7 +261,7 @@ def _row_lengths(self):
return self._row_lengths_cache

@property
def _column_widths(self):
def column_widths(self):
"""
Compute the column partitions widths if they are not cached.

Expand Down Expand Up @@ -295,7 +295,7 @@ def _axes_lengths(self):
list
The pair of row partitions lengths and column partitions widths.
"""
return [self._row_lengths, self._column_widths]
return [self.row_lengths, self.column_widths]

@property
def dtypes(self):
Expand Down Expand Up @@ -500,14 +500,14 @@ def _filter_empties(self, compute_metadata=True):
[
self._partitions[i][j]
for j in range(len(self._partitions[i]))
if j < len(self._column_widths) and self._column_widths[j] != 0
if j < len(self.column_widths) and self.column_widths[j] != 0
]
for i in range(len(self._partitions))
if i < len(self._row_lengths) and self._row_lengths[i] != 0
if i < len(self.row_lengths) and self.row_lengths[i] != 0
]
)
self._column_widths_cache = [w for w in self._column_widths if w != 0]
self._row_lengths_cache = [r for r in self._row_lengths if r != 0]
self._column_widths_cache = [w for w in self.column_widths if w != 0]
self._row_lengths_cache = [r for r in self.row_lengths if r != 0]

def synchronize_labels(self, axis=None):
"""
Expand Down Expand Up @@ -541,9 +541,9 @@ def _propagate_index_objs(self, axis=None):
"""
self._filter_empties()
if axis is None or axis == 0:
cum_row_lengths = np.cumsum([0] + self._row_lengths)
cum_row_lengths = np.cumsum([0] + self.row_lengths)
if axis is None or axis == 1:
cum_col_widths = np.cumsum([0] + self._column_widths)
cum_col_widths = np.cumsum([0] + self.column_widths)

if axis is None:

Expand All @@ -563,8 +563,8 @@ def apply_idx_objs(df, idx, cols):
cols=self.columns[
slice(cum_col_widths[j], cum_col_widths[j + 1])
],
length=self._row_lengths[i],
width=self._column_widths[j],
length=self.row_lengths[i],
width=self.column_widths[j],
)
for j in range(len(self._partitions[i]))
]
Expand All @@ -586,8 +586,8 @@ def apply_idx_objs(df, idx):
idx=self.index[
slice(cum_row_lengths[i], cum_row_lengths[i + 1])
],
length=self._row_lengths[i],
width=self._column_widths[j],
length=self.row_lengths[i],
width=self.column_widths[j],
)
for j in range(len(self._partitions[i]))
]
Expand All @@ -608,8 +608,8 @@ def apply_idx_objs(df, cols):
cols=self.columns[
slice(cum_col_widths[j], cum_col_widths[j + 1])
],
length=self._row_lengths[i],
width=self._column_widths[j],
length=self.row_lengths[i],
width=self.column_widths[j],
)
for j in range(len(self._partitions[i]))
]
Expand Down Expand Up @@ -709,9 +709,9 @@ def _get_new_lengths(self, partitions_dict, *, axis: int) -> List[int]:
"""
# Helper for take_2d_positional
if axis == 0:
axis_lengths = self._row_lengths
axis_lengths = self.row_lengths
else:
axis_lengths = self._column_widths
axis_lengths = self.column_widths

new_lengths = [
len(
Expand Down Expand Up @@ -836,7 +836,7 @@ def _take_2d_positional(
ErrorMessage.catch_bugs_and_request_email(
failure_condition=sum(new_col_widths) != len(new_columns),
extra_log=f"{sum(new_col_widths)} != {len(new_columns)}.\n"
+ f"{col_positions}\n{self._column_widths}\n{col_partitions_dict}",
+ f"{col_positions}\n{self.column_widths}\n{col_partitions_dict}",
)

if self._dtypes is not None:
Expand Down Expand Up @@ -1026,8 +1026,8 @@ def from_labels_executor(df, **kwargs):
keep_remaining=True,
)
new_column_widths = [
self.index.nlevels + self._column_widths[0]
] + self._column_widths[1:]
self.index.nlevels + self.column_widths[0]
] + self.column_widths[1:]
result = self.__constructor__(
new_parts,
new_row_labels,
Expand Down Expand Up @@ -1174,8 +1174,8 @@ def astype_builder(df):
new_frame,
self.index,
self.columns,
self._row_lengths,
self._column_widths,
self.row_lengths,
self.column_widths,
new_dtypes,
)

Expand Down Expand Up @@ -1366,9 +1366,9 @@ def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False):
if has_negative or not are_indices_sorted:
indices = np.sort(indices)
if axis == 0:
bins = np.array(self._row_lengths)
bins = np.array(self.row_lengths)
else:
bins = np.array(self._column_widths)
bins = np.array(self.column_widths)
# INT_MAX to make sure we don't try to compute on partitions that don't exist.
cumulative = np.append(bins[:-1].cumsum(), np.iinfo(bins.dtype).max)

Expand Down Expand Up @@ -1759,8 +1759,8 @@ def fold(self, axis, func):
new_partitions,
self.index,
self.columns,
self._row_lengths,
self._column_widths,
self.row_lengths,
self.column_widths,
)

def infer_objects(self) -> "PandasDataframe":
Expand Down Expand Up @@ -1804,8 +1804,8 @@ def infer_types(self, col_labels: List[str]) -> "PandasDataframe":
self._partitions,
self.index,
self.columns,
self._row_lengths,
self._column_widths,
self.row_lengths,
self.column_widths,
new_dtypes,
)

Expand Down Expand Up @@ -1910,8 +1910,8 @@ def map_fn(df):
new_parts,
new_index,
new_cols,
self._row_lengths,
self._column_widths,
self.row_lengths,
self.column_widths,
new_dtypes,
)

Expand Down Expand Up @@ -2218,8 +2218,8 @@ def apply_select_indices(
lengths_objs = {
axis: [len(apply_indices)]
if not keep_remaining
else [self._row_lengths, self._column_widths][axis],
axis ^ 1: [self._row_lengths, self._column_widths][axis ^ 1],
else [self.row_lengths, self.column_widths][axis],
axis ^ 1: [self.row_lengths, self.column_widths][axis ^ 1],
}
return self.__constructor__(
new_partitions, new_index, new_columns, lengths_objs[0], lengths_objs[1]
Expand Down Expand Up @@ -2352,7 +2352,7 @@ def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all):
broadcast [self[key1], self[key2]] partitions and internal indices for `self` must be [[0, 1], [5]]
"""
if broadcast_all:
sizes = self._row_lengths if axis else self._column_widths
sizes = self.row_lengths if axis else self.column_widths
return {key: dict(enumerate(sizes)) for key in indices.keys()}
passed_len = 0
result_dict = {}
Expand Down Expand Up @@ -2622,9 +2622,7 @@ def _copartition(self, axis, other, how, sort, force_repartition=False):
base_lengths = [obj.length() for obj in reindexed_base.T[0]]
else:
reindexed_base = base_frame._partitions
base_lengths = (
base_frame._column_widths if axis else base_frame._row_lengths
)
base_lengths = base_frame.column_widths if axis else base_frame.row_lengths

others_lengths = [o._axes_lengths[axis] for o in other_frames]

Expand Down Expand Up @@ -2686,15 +2684,15 @@ def n_ary_op(self, op, right_frames: list, join_type="outer"):
0, right_frames, join_type, sort=True
)
new_left_frame = self.__constructor__(
left_parts, joined_index, self.columns, row_lengths, self._column_widths
left_parts, joined_index, self.columns, row_lengths, self.column_widths
)
new_right_frames = [
self.__constructor__(
right_parts,
joined_index,
right_frame.columns,
row_lengths,
right_frame._column_widths,
right_frame.column_widths,
)
for right_parts, right_frame in zip(list_of_right_parts, right_frames)
]
Expand Down Expand Up @@ -2768,7 +2766,7 @@ def _compute_new_widths():
if (
axis == Axis.ROW_WISE
and all(o.columns.equals(self.columns) for o in others)
and all(o._column_widths == self._column_widths for o in others)
and all(o.column_widths == self.column_widths for o in others)
):
joined_index = self.columns
left_parts = self._partitions
Expand All @@ -2777,7 +2775,7 @@ def _compute_new_widths():
elif (
axis == Axis.COL_WISE
and all(o.index.equals(self.index) for o in others)
and all(o._row_lengths == self._row_lengths for o in others)
and all(o.row_lengths == self.row_lengths for o in others)
):
joined_index = self.index
left_parts = self._partitions
Expand Down Expand Up @@ -3109,8 +3107,8 @@ def transpose(self):
new_partitions,
self.columns,
self.index,
self._column_widths,
self._row_lengths,
self.column_widths,
self.row_lengths,
dtypes=new_dtypes,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def get_chunks(
cur_n_chunks = self.num_chunks()
n_rows = self.size()
if n_chunks is None or n_chunks == cur_n_chunks:
cum_row_lengths = np.cumsum([0] + self._col._row_lengths)
cum_row_lengths = np.cumsum([0] + self._col.row_lengths)
for i in range(len(cum_row_lengths) - 1):
yield PandasProtocolColumn(
self._col.take_2d_labels_or_positional(
Expand Down Expand Up @@ -304,9 +304,9 @@ def get_chunks(
self._col.index,
self._col.columns,
new_lengths,
self._col._column_widths,
self._col.column_widths,
)
cum_row_lengths = np.cumsum([0] + new_df._row_lengths)
cum_row_lengths = np.cumsum([0] + new_df.row_lengths)
for i in range(len(cum_row_lengths) - 1):
yield PandasProtocolColumn(
new_df.take_2d_labels_or_positional(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def get_chunks(
cur_n_chunks = self.num_chunks()
n_rows = self.num_rows()
if n_chunks is None or n_chunks == cur_n_chunks:
cum_row_lengths = np.cumsum([0] + self._df._row_lengths)
cum_row_lengths = np.cumsum([0] + self._df.row_lengths)
for i in range(len(cum_row_lengths) - 1):
yield PandasProtocolDataframe(
self._df.take_2d_labels_or_positional(
Expand Down Expand Up @@ -188,9 +188,9 @@ def get_chunks(
self._df.index,
self._df.columns,
new_lengths,
self._df._column_widths,
self._df.column_widths,
)
cum_row_lengths = np.cumsum([0] + new_df._row_lengths)
cum_row_lengths = np.cumsum([0] + new_df.row_lengths)
for i in range(len(cum_row_lengths) - 1):
yield PandasProtocolDataframe(
new_df.take_2d_labels_or_positional(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ def synchronize_labels(self, axis=None):
axis is not None and axis not in [0, 1]
)

cum_row_lengths = np.cumsum([0] + self._row_lengths)
cum_col_widths = np.cumsum([0] + self._column_widths)
cum_row_lengths = np.cumsum([0] + self.row_lengths)
cum_col_widths = np.cumsum([0] + self.column_widths)

def apply_idx_objs(df, idx, cols, axis):
# cudf does not support set_axis. It only supports rename with 1-to-1 mapping.
Expand Down Expand Up @@ -180,7 +180,7 @@ def take_2d_labels_or_positional(
# on the partition. Often this will be the same length as the current
# length, but sometimes it is different, thus the extra calculation.
new_row_lengths = [
len(range(*idx.indices(self._row_lengths[p])))
len(range(*idx.indices(self.row_lengths[p])))
for p, idx in row_partitions_list.items()
]
# Use the slice to calculate the new row index
Expand All @@ -189,10 +189,8 @@ def take_2d_labels_or_positional(
new_row_lengths = [len(idx) for _, idx in row_partitions_list.items()]
new_index = self.index[sorted(row_positions)]
else:
row_partitions_list = {
i: slice(None) for i in range(len(self._row_lengths))
}
new_row_lengths = self._row_lengths
row_partitions_list = {i: slice(None) for i in range(len(self.row_lengths))}
new_row_lengths = self.row_lengths
new_index = self.index

if col_labels is not None:
Expand All @@ -204,7 +202,7 @@ def take_2d_labels_or_positional(
# on the partition. Often this will be the same length as the current
# length, but sometimes it is different, thus the extra calculation.
new_col_widths = [
len(range(*idx.indices(self._column_widths[p])))
len(range(*idx.indices(self.column_widths[p])))
for p, idx in col_partitions_list.items()
]
# Use the slice to calculate the new columns
Expand All @@ -215,7 +213,7 @@ def take_2d_labels_or_positional(
sum(new_col_widths),
len(new_columns),
col_positions,
self._column_widths,
self.column_widths,
col_partitions_list,
)
if self._dtypes is not None:
Expand All @@ -231,9 +229,9 @@ def take_2d_labels_or_positional(
new_dtypes = None
else:
col_partitions_list = {
i: slice(None) for i in range(len(self._column_widths))
i: slice(None) for i in range(len(self.column_widths))
}
new_col_widths = self._column_widths
new_col_widths = self.column_widths
new_columns = self.columns
if self._dtypes is not None:
new_dtypes = self.dtypes
Expand Down
2 changes: 1 addition & 1 deletion modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,7 @@ def reset_index(self, **kwargs):
new_self = self.copy()
new_self.index = (
# Cheaper to compute row lengths than index
pandas.RangeIndex(sum(new_self._modin_frame._row_lengths))
pandas.RangeIndex(sum(new_self._modin_frame.row_lengths))
if new_index is None
else new_index
)
Expand Down
2 changes: 1 addition & 1 deletion modin/experimental/xgboost/xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def __init__(
self.metadata = (
data.index,
data.columns,
data._query_compiler._modin_frame._row_lengths,
data._query_compiler._modin_frame.row_lengths,
)

def __iter__(self):
Expand Down
4 changes: 2 additions & 2 deletions modin/test/storage_formats/pandas/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ def test_apply_func_to_both_axis(has_partitions_shape_cache, has_frame_shape_cac

if has_frame_shape_cache:
# Explicitly compute rows & columns shapes to store this info in frame's cache
modin_frame._row_lengths
modin_frame._column_widths
modin_frame.row_lengths
modin_frame.column_widths
else:
# Explicitly reset frame's cache
modin_frame._row_lengths_cache = None
Expand Down