Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REFACTOR-#5024: Make _row_lengths and _column_widths public #5025

Merged
merged 1 commit into from
Sep 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.16.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ Key Features and Updates
* REFACTOR-#4970: Rewrite base implementations of a partition's `width/length` (#4971)
* REFACTOR-#4942: Remove `call` method in favor of `register` due to duplication (#4943)
* REFACTOR-#4922: Helpers for take_2d_labels_or_positional (#4865)
* REFACTOR-#5024: Make `_row_lengths` and `_column_widths` public (#5025)
* REFACTOR-#5009: Use `RayWrapper.materialize` instead of `ray.get` (#5010)
* REFACTOR-#4755: Rewrite Pandas version mismatch warning (#4965)
* Pandas API implementations and improvements
Expand Down
82 changes: 40 additions & 42 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def _validate_axes_lengths(self):
)

@property
def _row_lengths(self):
def row_lengths(self):
"""
Compute the row partitions lengths if they are not cached.

Expand All @@ -261,7 +261,7 @@ def _row_lengths(self):
return self._row_lengths_cache

@property
def _column_widths(self):
def column_widths(self):
"""
Compute the column partitions widths if they are not cached.

Expand Down Expand Up @@ -295,7 +295,7 @@ def _axes_lengths(self):
list
The pair of row partitions lengths and column partitions widths.
"""
return [self._row_lengths, self._column_widths]
return [self.row_lengths, self.column_widths]

@property
def dtypes(self):
Expand Down Expand Up @@ -500,14 +500,14 @@ def _filter_empties(self, compute_metadata=True):
[
self._partitions[i][j]
for j in range(len(self._partitions[i]))
if j < len(self._column_widths) and self._column_widths[j] != 0
if j < len(self.column_widths) and self.column_widths[j] != 0
]
for i in range(len(self._partitions))
if i < len(self._row_lengths) and self._row_lengths[i] != 0
if i < len(self.row_lengths) and self.row_lengths[i] != 0
]
)
self._column_widths_cache = [w for w in self._column_widths if w != 0]
self._row_lengths_cache = [r for r in self._row_lengths if r != 0]
self._column_widths_cache = [w for w in self.column_widths if w != 0]
self._row_lengths_cache = [r for r in self.row_lengths if r != 0]

def synchronize_labels(self, axis=None):
"""
Expand Down Expand Up @@ -541,9 +541,9 @@ def _propagate_index_objs(self, axis=None):
"""
self._filter_empties()
if axis is None or axis == 0:
cum_row_lengths = np.cumsum([0] + self._row_lengths)
cum_row_lengths = np.cumsum([0] + self.row_lengths)
if axis is None or axis == 1:
cum_col_widths = np.cumsum([0] + self._column_widths)
cum_col_widths = np.cumsum([0] + self.column_widths)

if axis is None:

Expand All @@ -563,8 +563,8 @@ def apply_idx_objs(df, idx, cols):
cols=self.columns[
slice(cum_col_widths[j], cum_col_widths[j + 1])
],
length=self._row_lengths[i],
width=self._column_widths[j],
length=self.row_lengths[i],
width=self.column_widths[j],
)
for j in range(len(self._partitions[i]))
]
Expand All @@ -586,8 +586,8 @@ def apply_idx_objs(df, idx):
idx=self.index[
slice(cum_row_lengths[i], cum_row_lengths[i + 1])
],
length=self._row_lengths[i],
width=self._column_widths[j],
length=self.row_lengths[i],
width=self.column_widths[j],
)
for j in range(len(self._partitions[i]))
]
Expand All @@ -608,8 +608,8 @@ def apply_idx_objs(df, cols):
cols=self.columns[
slice(cum_col_widths[j], cum_col_widths[j + 1])
],
length=self._row_lengths[i],
width=self._column_widths[j],
length=self.row_lengths[i],
width=self.column_widths[j],
)
for j in range(len(self._partitions[i]))
]
Expand Down Expand Up @@ -709,9 +709,9 @@ def _get_new_lengths(self, partitions_dict, *, axis: int) -> List[int]:
"""
# Helper for take_2d_positional
if axis == 0:
axis_lengths = self._row_lengths
axis_lengths = self.row_lengths
else:
axis_lengths = self._column_widths
axis_lengths = self.column_widths

new_lengths = [
len(
Expand Down Expand Up @@ -836,7 +836,7 @@ def _take_2d_positional(
ErrorMessage.catch_bugs_and_request_email(
failure_condition=sum(new_col_widths) != len(new_columns),
extra_log=f"{sum(new_col_widths)} != {len(new_columns)}.\n"
+ f"{col_positions}\n{self._column_widths}\n{col_partitions_dict}",
+ f"{col_positions}\n{self.column_widths}\n{col_partitions_dict}",
)

if self._dtypes is not None:
Expand Down Expand Up @@ -1026,8 +1026,8 @@ def from_labels_executor(df, **kwargs):
keep_remaining=True,
)
new_column_widths = [
self.index.nlevels + self._column_widths[0]
] + self._column_widths[1:]
self.index.nlevels + self.column_widths[0]
] + self.column_widths[1:]
result = self.__constructor__(
new_parts,
new_row_labels,
Expand Down Expand Up @@ -1174,8 +1174,8 @@ def astype_builder(df):
new_frame,
self.index,
self.columns,
self._row_lengths,
self._column_widths,
self.row_lengths,
self.column_widths,
new_dtypes,
)

Expand Down Expand Up @@ -1366,9 +1366,9 @@ def _get_dict_of_block_index(self, axis, indices, are_indices_sorted=False):
if has_negative or not are_indices_sorted:
indices = np.sort(indices)
if axis == 0:
bins = np.array(self._row_lengths)
bins = np.array(self.row_lengths)
else:
bins = np.array(self._column_widths)
bins = np.array(self.column_widths)
# INT_MAX to make sure we don't try to compute on partitions that don't exist.
cumulative = np.append(bins[:-1].cumsum(), np.iinfo(bins.dtype).max)

Expand Down Expand Up @@ -1759,8 +1759,8 @@ def fold(self, axis, func):
new_partitions,
self.index,
self.columns,
self._row_lengths,
self._column_widths,
self.row_lengths,
self.column_widths,
)

def infer_objects(self) -> "PandasDataframe":
Expand Down Expand Up @@ -1804,8 +1804,8 @@ def infer_types(self, col_labels: List[str]) -> "PandasDataframe":
self._partitions,
self.index,
self.columns,
self._row_lengths,
self._column_widths,
self.row_lengths,
self.column_widths,
new_dtypes,
)

Expand Down Expand Up @@ -1910,8 +1910,8 @@ def map_fn(df):
new_parts,
new_index,
new_cols,
self._row_lengths,
self._column_widths,
self.row_lengths,
self.column_widths,
new_dtypes,
)

Expand Down Expand Up @@ -2218,8 +2218,8 @@ def apply_select_indices(
lengths_objs = {
axis: [len(apply_indices)]
if not keep_remaining
else [self._row_lengths, self._column_widths][axis],
axis ^ 1: [self._row_lengths, self._column_widths][axis ^ 1],
else [self.row_lengths, self.column_widths][axis],
axis ^ 1: [self.row_lengths, self.column_widths][axis ^ 1],
}
return self.__constructor__(
new_partitions, new_index, new_columns, lengths_objs[0], lengths_objs[1]
Expand Down Expand Up @@ -2352,7 +2352,7 @@ def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all):
broadcast [self[key1], self[key2]] partitions and internal indices for `self` must be [[0, 1], [5]]
"""
if broadcast_all:
sizes = self._row_lengths if axis else self._column_widths
sizes = self.row_lengths if axis else self.column_widths
return {key: dict(enumerate(sizes)) for key in indices.keys()}
passed_len = 0
result_dict = {}
Expand Down Expand Up @@ -2622,9 +2622,7 @@ def _copartition(self, axis, other, how, sort, force_repartition=False):
base_lengths = [obj.length() for obj in reindexed_base.T[0]]
else:
reindexed_base = base_frame._partitions
base_lengths = (
base_frame._column_widths if axis else base_frame._row_lengths
)
base_lengths = base_frame.column_widths if axis else base_frame.row_lengths

others_lengths = [o._axes_lengths[axis] for o in other_frames]

Expand Down Expand Up @@ -2686,15 +2684,15 @@ def n_ary_op(self, op, right_frames: list, join_type="outer"):
0, right_frames, join_type, sort=True
)
new_left_frame = self.__constructor__(
left_parts, joined_index, self.columns, row_lengths, self._column_widths
left_parts, joined_index, self.columns, row_lengths, self.column_widths
)
new_right_frames = [
self.__constructor__(
right_parts,
joined_index,
right_frame.columns,
row_lengths,
right_frame._column_widths,
right_frame.column_widths,
)
for right_parts, right_frame in zip(list_of_right_parts, right_frames)
]
Expand Down Expand Up @@ -2768,7 +2766,7 @@ def _compute_new_widths():
if (
axis == Axis.ROW_WISE
and all(o.columns.equals(self.columns) for o in others)
and all(o._column_widths == self._column_widths for o in others)
and all(o.column_widths == self.column_widths for o in others)
):
joined_index = self.columns
left_parts = self._partitions
Expand All @@ -2777,7 +2775,7 @@ def _compute_new_widths():
elif (
axis == Axis.COL_WISE
and all(o.index.equals(self.index) for o in others)
and all(o._row_lengths == self._row_lengths for o in others)
and all(o.row_lengths == self.row_lengths for o in others)
):
joined_index = self.index
left_parts = self._partitions
Expand Down Expand Up @@ -3109,8 +3107,8 @@ def transpose(self):
new_partitions,
self.columns,
self.index,
self._column_widths,
self._row_lengths,
self.column_widths,
self.row_lengths,
dtypes=new_dtypes,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def get_chunks(
cur_n_chunks = self.num_chunks()
n_rows = self.size()
if n_chunks is None or n_chunks == cur_n_chunks:
cum_row_lengths = np.cumsum([0] + self._col._row_lengths)
cum_row_lengths = np.cumsum([0] + self._col.row_lengths)
for i in range(len(cum_row_lengths) - 1):
yield PandasProtocolColumn(
self._col.take_2d_labels_or_positional(
Expand Down Expand Up @@ -304,9 +304,9 @@ def get_chunks(
self._col.index,
self._col.columns,
new_lengths,
self._col._column_widths,
self._col.column_widths,
)
cum_row_lengths = np.cumsum([0] + new_df._row_lengths)
cum_row_lengths = np.cumsum([0] + new_df.row_lengths)
for i in range(len(cum_row_lengths) - 1):
yield PandasProtocolColumn(
new_df.take_2d_labels_or_positional(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def get_chunks(
cur_n_chunks = self.num_chunks()
n_rows = self.num_rows()
if n_chunks is None or n_chunks == cur_n_chunks:
cum_row_lengths = np.cumsum([0] + self._df._row_lengths)
cum_row_lengths = np.cumsum([0] + self._df.row_lengths)
for i in range(len(cum_row_lengths) - 1):
yield PandasProtocolDataframe(
self._df.take_2d_labels_or_positional(
Expand Down Expand Up @@ -188,9 +188,9 @@ def get_chunks(
self._df.index,
self._df.columns,
new_lengths,
self._df._column_widths,
self._df.column_widths,
)
cum_row_lengths = np.cumsum([0] + new_df._row_lengths)
cum_row_lengths = np.cumsum([0] + new_df.row_lengths)
for i in range(len(cum_row_lengths) - 1):
yield PandasProtocolDataframe(
new_df.take_2d_labels_or_positional(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ def synchronize_labels(self, axis=None):
axis is not None and axis not in [0, 1]
)

cum_row_lengths = np.cumsum([0] + self._row_lengths)
cum_col_widths = np.cumsum([0] + self._column_widths)
cum_row_lengths = np.cumsum([0] + self.row_lengths)
cum_col_widths = np.cumsum([0] + self.column_widths)

def apply_idx_objs(df, idx, cols, axis):
# cudf does not support set_axis. It only supports rename with 1-to-1 mapping.
Expand Down Expand Up @@ -180,7 +180,7 @@ def take_2d_labels_or_positional(
# on the partition. Often this will be the same length as the current
# length, but sometimes it is different, thus the extra calculation.
new_row_lengths = [
len(range(*idx.indices(self._row_lengths[p])))
len(range(*idx.indices(self.row_lengths[p])))
for p, idx in row_partitions_list.items()
]
# Use the slice to calculate the new row index
Expand All @@ -189,10 +189,8 @@ def take_2d_labels_or_positional(
new_row_lengths = [len(idx) for _, idx in row_partitions_list.items()]
new_index = self.index[sorted(row_positions)]
else:
row_partitions_list = {
i: slice(None) for i in range(len(self._row_lengths))
}
new_row_lengths = self._row_lengths
row_partitions_list = {i: slice(None) for i in range(len(self.row_lengths))}
new_row_lengths = self.row_lengths
new_index = self.index

if col_labels is not None:
Expand All @@ -204,7 +202,7 @@ def take_2d_labels_or_positional(
# on the partition. Often this will be the same length as the current
# length, but sometimes it is different, thus the extra calculation.
new_col_widths = [
len(range(*idx.indices(self._column_widths[p])))
len(range(*idx.indices(self.column_widths[p])))
for p, idx in col_partitions_list.items()
]
# Use the slice to calculate the new columns
Expand All @@ -215,7 +213,7 @@ def take_2d_labels_or_positional(
sum(new_col_widths),
len(new_columns),
col_positions,
self._column_widths,
self.column_widths,
col_partitions_list,
)
if self._dtypes is not None:
Expand All @@ -231,9 +229,9 @@ def take_2d_labels_or_positional(
new_dtypes = None
else:
col_partitions_list = {
i: slice(None) for i in range(len(self._column_widths))
i: slice(None) for i in range(len(self.column_widths))
}
new_col_widths = self._column_widths
new_col_widths = self.column_widths
new_columns = self.columns
if self._dtypes is not None:
new_dtypes = self.dtypes
Expand Down
2 changes: 1 addition & 1 deletion modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,7 @@ def reset_index(self, **kwargs):
new_self = self.copy()
new_self.index = (
# Cheaper to compute row lengths than index
pandas.RangeIndex(sum(new_self._modin_frame._row_lengths))
pandas.RangeIndex(sum(new_self._modin_frame.row_lengths))
if new_index is None
else new_index
)
Expand Down
2 changes: 1 addition & 1 deletion modin/experimental/xgboost/xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def __init__(
self.metadata = (
data.index,
data.columns,
data._query_compiler._modin_frame._row_lengths,
data._query_compiler._modin_frame.row_lengths,
)

def __iter__(self):
Expand Down
4 changes: 2 additions & 2 deletions modin/test/storage_formats/pandas/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ def test_apply_func_to_both_axis(has_partitions_shape_cache, has_frame_shape_cac

if has_frame_shape_cache:
# Explicitly compute rows & columns shapes to store this info in frame's cache
modin_frame._row_lengths
modin_frame._column_widths
modin_frame.row_lengths
modin_frame.column_widths
else:
# Explicitly reset frame's cache
modin_frame._row_lengths_cache = None
Expand Down