
Bump pandas from 1.5.3 to the latest stable version #422

Merged: 102 commits (Jan 6, 2025)
fb4f0d6
Bump pandas from 1.5.3 to 2.0.3
Yerzhaisang Oct 29, 2024
06076d9
Updated CHANGELOG
Yerzhaisang Oct 29, 2024
3ef1e2c
Updated CHANGELOG
Yerzhaisang Nov 3, 2024
da14ee5
updated built-in method
Yerzhaisang Nov 4, 2024
c68b7f8
removed unused comment
Yerzhaisang Nov 4, 2024
0084cce
Implement to_list_if_needed method for list conversion
Yerzhaisang Nov 4, 2024
12bacb7
Refactor MAD calculation using CustomFunctionDispatcher for improved …
Yerzhaisang Nov 4, 2024
77ebf1c
Refactor MAD calculation using CustomFunctionDispatcher for improved …
Yerzhaisang Nov 4, 2024
45e793a
Refactor MAD calculation using CustomFunctionDispatcher for improved …
Yerzhaisang Nov 4, 2024
c4455e0
Refactor MAD calculation using CustomFunctionDispatcher for improved …
Yerzhaisang Nov 4, 2024
7135507
refactor: move metric identifiers to constants.py for readability
Yerzhaisang Nov 4, 2024
d2203be
refactor: move metric identifiers to constants.py for readability
Yerzhaisang Nov 4, 2024
c55639d
Removed unused line
Yerzhaisang Nov 5, 2024
b1f2319
clarify build_pd_series docstring
Yerzhaisang Nov 5, 2024
77c56a7
save constansts in utils.py
Yerzhaisang Nov 5, 2024
25905b6
fiexd CI
Yerzhaisang Nov 5, 2024
feeb0b5
added keyword argument to to_csv method
Yerzhaisang Nov 5, 2024
1162608
added comment to describe method
Yerzhaisang Nov 5, 2024
dce7c74
possible pandas versions
Yerzhaisang Dec 3, 2024
cd5ff41
upgrading pandas
Yerzhaisang Dec 9, 2024
87411e3
upgrading pandas
Yerzhaisang Dec 9, 2024
8665e59
wide range of pandas versions
Yerzhaisang Dec 9, 2024
a02af31
test
Yerzhaisang Dec 10, 2024
101ead9
test commit
Yerzhaisang Dec 15, 2024
7b1d24d
Fixed doctest dtypes
Yerzhaisang Dec 23, 2024
25cf1b5
Fixed doctest dtypes
Yerzhaisang Dec 23, 2024
8bfe735
Fixed datetime dtypes
Yerzhaisang Dec 27, 2024
f7746cb
Fixed datetime dtypes
Yerzhaisang Dec 27, 2024
cf9363a
testing another version
Yerzhaisang Dec 27, 2024
ce6554c
Fixed datetime dtypes
Yerzhaisang Dec 27, 2024
ff469e2
testing another version
Yerzhaisang Dec 27, 2024
c58cc67
Fixed datetime dtypes
Yerzhaisang Dec 27, 2024
8bf6835
Fixed datetime dtypes
Yerzhaisang Dec 27, 2024
6485228
Fixed datetime dtypes
Yerzhaisang Dec 27, 2024
dc4a5bf
testing another version
Yerzhaisang Dec 27, 2024
612f262
Fixed testing issues
Yerzhaisang Dec 28, 2024
83c0a24
Fixed testing issues
Yerzhaisang Dec 28, 2024
4f2d71a
testing another version
Yerzhaisang Dec 28, 2024
563488c
testing another version
Yerzhaisang Dec 28, 2024
1589de9
Fixed testing issues
Yerzhaisang Dec 28, 2024
36570c1
testing another version
Yerzhaisang Dec 28, 2024
2cc883d
testing another version
Yerzhaisang Dec 28, 2024
b6f3433
testing another version
Yerzhaisang Dec 28, 2024
5bfb86f
testing another version
Yerzhaisang Dec 28, 2024
f3337bf
testing another version
Yerzhaisang Dec 28, 2024
41f7b17
testing another version
Yerzhaisang Dec 28, 2024
77e4326
testing another version
Yerzhaisang Dec 28, 2024
78e530d
testing another version
Yerzhaisang Dec 28, 2024
ebea62e
testing another version
Yerzhaisang Dec 28, 2024
c073e96
testing another version
Yerzhaisang Dec 28, 2024
fdaf8df
testing another version
Yerzhaisang Dec 28, 2024
427a4aa
testing another version
Yerzhaisang Dec 28, 2024
20ec7e4
testing final pandas version range
Yerzhaisang Dec 28, 2024
f4aa0df
Merge branch 'dev' into dev_test
Yerzhaisang Dec 28, 2024
707eb1f
updating CHANGELOG
Yerzhaisang Dec 28, 2024
3bf0816
improving test coverage
Yerzhaisang Dec 28, 2024
3230f1a
Merge branch 'main' into dev
Yerzhaisang Jan 2, 2025
6b4f3a7
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
e53a0e3
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
deecdbc
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
a386bf4
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
8bf7df9
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
11aecc5
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
3f1834d
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
cc0d810
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
3bdfb6a
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
87ec2ff
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
d45da1a
adapt tests for backward compatability
Yerzhaisang Jan 2, 2025
49198f1
adapt tests for backward compatability
Yerzhaisang Jan 3, 2025
73bc70c
adapt tests for backward compatability
Yerzhaisang Jan 3, 2025
e9ec7fc
adapt tests for backward compatability
Yerzhaisang Jan 3, 2025
c9be924
adapt tests for backward compatability
Yerzhaisang Jan 3, 2025
b20c31c
adapt tests for backward compatability
Yerzhaisang Jan 3, 2025
c26e5e1
adapt tests for backward compatability
Yerzhaisang Jan 3, 2025
7522433
adapt tests for backward compatability
Yerzhaisang Jan 3, 2025
93f491a
adapt tests for backward compatability
Yerzhaisang Jan 3, 2025
c4ca7ee
rerun tests
Yerzhaisang Jan 3, 2025
be44c48
rerun tests
Yerzhaisang Jan 3, 2025
1c9159c
pandas 2.0.0
Yerzhaisang Jan 3, 2025
d5f365c
applied to_string and strip methods
Yerzhaisang Jan 3, 2025
6d1dd9e
applied to_string and strip methods
Yerzhaisang Jan 3, 2025
4fc4a88
applied to_string and strip methods
Yerzhaisang Jan 3, 2025
933498b
applied to_string and strip methods
Yerzhaisang Jan 3, 2025
cfa0e3d
applied to_string and strip methods
Yerzhaisang Jan 3, 2025
1d4691b
applied to_string and strip methods
Yerzhaisang Jan 4, 2025
240a902
applied to_string and strip methods
Yerzhaisang Jan 4, 2025
209a646
testing pandas 1.5.2
Yerzhaisang Jan 4, 2025
0efc20d
testing pandas 1.5.3
Yerzhaisang Jan 4, 2025
2e516f2
testing pandas 2.0.0
Yerzhaisang Jan 4, 2025
5129800
testing pandas 2.0.1
Yerzhaisang Jan 4, 2025
9f67ddb
testing pandas 2.0.2
Yerzhaisang Jan 4, 2025
e77e8c5
testing pandas 2.0.3
Yerzhaisang Jan 4, 2025
df05824
testing pandas 2.1.1
Yerzhaisang Jan 4, 2025
ad535f1
testing pandas 2.1.2
Yerzhaisang Jan 4, 2025
02b7fa2
testing pandas 2.1.3
Yerzhaisang Jan 4, 2025
c30064c
testing pandas 2.1.4
Yerzhaisang Jan 4, 2025
31b12f6
testing pandas 2.2.0
Yerzhaisang Jan 4, 2025
3bfa8b7
testing pandas 2.2.1
Yerzhaisang Jan 4, 2025
55faaf6
testing pandas 2.2.2
Yerzhaisang Jan 4, 2025
1ec27c2
testing pandas 2.2.3
Yerzhaisang Jan 4, 2025
88d38cd
fixing pandas version
Yerzhaisang Jan 4, 2025
d959396
fixing pandas version
Yerzhaisang Jan 4, 2025
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -48,6 +48,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- Update model upload history - opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini (v.1.0.0)(TORCH_SCRIPT) by @dhrubo-os ([#417](https://github.com/opensearch-project/opensearch-py-ml/pull/417))
- Update model upload history - opensearch-project/opensearch-neural-sparse-encoding-v2-distill (v.1.0.0)(TORCH_SCRIPT) by @dhrubo-os ([#419](https://github.com/opensearch-project/opensearch-py-ml/pull/419))
- Upgrade GitHub Actions workflows to use `@v4` to prevent deprecation issues with `@v3` by @yerzhaisang ([#428](https://github.com/opensearch-project/opensearch-py-ml/pull/428))
- Bump pandas from 1.5.3 to the latest stable version by @yerzhaisang ([#422](https://github.com/opensearch-project/opensearch-py-ml/pull/422))


### Fixed
- Fix the wrong final zip file name in model_uploader workflow, now will name it by the upload_prefix alse.([#413](https://github.com/opensearch-project/opensearch-py-ml/pull/413/files))
2 changes: 1 addition & 1 deletion docs/requirements-docs.txt
@@ -1,5 +1,5 @@
opensearch-py>=2
pandas>=1.5,<3
pandas>=1.5.2,<2.3,!=2.1.0
matplotlib>=3.6.0,<4
nbval
sphinx
25 changes: 22 additions & 3 deletions opensearch_py_ml/common.py
@@ -55,14 +55,33 @@


def build_pd_series(
data: Dict[str, Any], dtype: Optional["DTypeLike"] = None, **kwargs: Any
data: Dict[str, Any],
dtype: Optional["DTypeLike"] = None,
index_name: Optional[str] = None,
**kwargs: Any,
) -> pd.Series:
"""Builds a pd.Series while squelching the warning
for unspecified dtype on empty series
"""
Builds a pandas Series from a dictionary, optionally setting an index name.

Parameters:
data : Dict[str, Any]
The data to build the Series from, with keys as the index.
dtype : Optional[DTypeLike]
The desired data type of the Series. If not specified, uses EMPTY_SERIES_DTYPE if data is empty.
index_name : Optional[str]
Name to assign to the Series index, similar to `index_name` in `value_counts`.

Returns:
pd.Series
A pandas Series constructed from the given data, with the specified dtype and index name.
"""

dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype)
if dtype is not None:
kwargs["dtype"] = dtype
if index_name is not None:
Collaborator: Can we add a comment on why we need this?

Contributor Author: done

index = pd.Index(data.keys(), name=index_name)
kwargs["index"] = index
return pd.Series(data, **kwargs)


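The reworked helper above can be exercised in isolation. The following is a minimal self-contained sketch; the `EMPTY_SERIES_DTYPE` value here is a stand-in for the module's real constant:

```python
from typing import Any, Dict, Optional

import pandas as pd

EMPTY_SERIES_DTYPE = "object"  # stand-in; the real constant lives in opensearch_py_ml


def build_pd_series(
    data: Dict[str, Any],
    dtype: Optional[str] = None,
    index_name: Optional[str] = None,
    **kwargs: Any,
) -> pd.Series:
    # Fall back to EMPTY_SERIES_DTYPE only for an empty mapping, squelching
    # pandas' "unspecified dtype on empty series" warning.
    dtype = dtype or (EMPTY_SERIES_DTYPE if not data else dtype)
    if dtype is not None:
        kwargs["dtype"] = dtype
    if index_name is not None:
        # Naming the index mirrors what pandas >= 2 does in value_counts().
        kwargs["index"] = pd.Index(data.keys(), name=index_name)
    return pd.Series(data, **kwargs)


s = build_pd_series({"a": 3, "b": 1}, index_name="letters")
print(s.index.name)  # letters
```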
47 changes: 25 additions & 22 deletions opensearch_py_ml/dataframe.py
@@ -47,14 +47,17 @@
from opensearch_py_ml.groupby import DataFrameGroupBy
from opensearch_py_ml.ndframe import NDFrame
from opensearch_py_ml.series import Series
from opensearch_py_ml.utils import is_valid_attr_name
from opensearch_py_ml.utils import is_valid_attr_name, to_list_if_needed

if TYPE_CHECKING:
from opensearchpy import OpenSearch

from .query_compiler import QueryCompiler


PANDAS_MAJOR_VERSION = int(pd.__version__.split(".")[0])


class DataFrame(NDFrame):
"""
Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes
@@ -275,22 +278,13 @@ def tail(self, n: int = 5) -> "DataFrame":
>>> from tests import OPENSEARCH_TEST_CLIENT

>>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights', columns=['Origin', 'Dest'])
>>> df.tail()
Origin \\
13054 Pisa International Airport...
13055 Winnipeg / James Armstrong Richardson International Airport...
13056 Licenciado Benito Juarez International Airport...
13057 Itami Airport...
13058 Adelaide International Airport...
<BLANKLINE>
Dest...
13054 Xi'an Xianyang International Airport...
13055 Zurich Airport...
13056 Ukrainka Air Base...
13057 Ministro Pistarini International Airport...
13058 Washington Dulles International Airport...
<BLANKLINE>
[5 rows x 2 columns]
>>> print(df.tail().to_string().strip())
Origin Dest
13054 Pisa International Airport Xi'an Xianyang International Airport
13055 Winnipeg / James Armstrong Richardson International Airport Zurich Airport
13056 Licenciado Benito Juarez International Airport Ukrainka Air Base
13057 Itami Airport Ministro Pistarini International Airport
13058 Adelaide International Airport Washington Dulles International Airport
"""
return DataFrame(_query_compiler=self._query_compiler.tail(n))

@@ -424,9 +418,14 @@ def drop(
axis = pd.DataFrame._get_axis_name(axis)
axes = {axis: labels}
elif index is not None or columns is not None:
Collaborator: Kind of confused here: the parent branch already checks that one of them is not None, but inside it checks again (lines 431 and 440). Maybe this could be simplified along the lines of what @pyek-bot suggested about creating a convert-to-list wrapper.

Contributor Author: Fixed
axes, _ = pd.DataFrame()._construct_axes_from_arguments(
(index, columns), {}
)
axes = {
"index": to_list_if_needed(index),
"columns": (
pd.Index(to_list_if_needed(columns))
if columns is not None
else None
Comment on lines +424 to +426
Reviewer: I noticed that in the implementation in opensearch_py_ml.utils, when the value is None it already returns None; maybe we don't need a ternary operation here since it's already doing that?

Contributor Author: Hey Brian, we can't remove the ternary operation at this point because calling pd.Index(None) results in a TypeError, so we need to keep this check.

),
}
else:
raise ValueError(
"Need to specify at least one of 'labels', 'index' or 'columns'"
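The `to_list_if_needed` helper imported above is not shown in this diff; a plausible sketch of its behavior, inferred from how `drop` uses it (the real implementation in `opensearch_py_ml.utils` may differ):

```python
import pandas as pd


def to_list_if_needed(value):
    """Normalize an index/columns argument to a plain list; None passes through.

    Sketch only: inferred from the call sites in DataFrame.drop above.
    """
    if value is None:
        return None
    if isinstance(value, pd.Index):
        return value.tolist()
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return [value]  # single scalar label


print(to_list_if_needed("Carrier"))  # ['Carrier']
```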
@@ -440,7 +439,7 @@ def drop(
axes["index"] = [axes["index"]]
if errors == "raise":
# Check if axes['index'] values exists in index
count = self._query_compiler._index_matches_count(axes["index"])
count = self._query_compiler._index_matches_count(list(axes["index"]))
if count != len(axes["index"]):
raise ValueError(
f"number of labels {count}!={len(axes['index'])} not contained in axis"
@@ -1341,6 +1340,10 @@ def to_csv(
--------
:pandas_api_docs:`pandas.DataFrame.to_csv`
"""
if PANDAS_MAJOR_VERSION < 2:
line_terminator_keyword = "line_terminator"
else:
line_terminator_keyword = "lineterminator"
kwargs = {
"path_or_buf": path_or_buf,
"sep": sep,
@@ -1355,7 +1358,7 @@
"compression": compression,
"quoting": quoting,
"quotechar": quotechar,
"line_terminator": line_terminator,
line_terminator_keyword: line_terminator,
"chunksize": chunksize,
"date_format": date_format,
"doublequote": doublequote,
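The version gate above exists because pandas 2.0 renamed the `to_csv` keyword from `line_terminator` to `lineterminator`, so the keyword name itself must be chosen at runtime. A small sketch of the pattern:

```python
import io

import pandas as pd

PANDAS_MAJOR_VERSION = int(pd.__version__.split(".")[0])

# pandas < 2 spells the keyword `line_terminator`; pandas >= 2 only accepts
# `lineterminator`, so pick the name before building the kwargs dict.
terminator_kw = "line_terminator" if PANDAS_MAJOR_VERSION < 2 else "lineterminator"

buf = io.StringIO()
pd.DataFrame({"a": [1, 2]}).to_csv(buf, index=False, **{terminator_kw: "\n"})
print(buf.getvalue())  # a\n1\n2\n
```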
1 change: 1 addition & 0 deletions opensearch_py_ml/etl.py
@@ -108,6 +108,7 @@ def pandas_to_opensearch(
... 'G': [1, 2, 3],
... 'H': 'Long text - to be indexed as os type text'},
... index=['0', '1', '2'])
>>> pd_df['D'] = pd_df['D'].astype('datetime64[ns]')
>>> type(pd_df)
<class 'pandas.core.frame.DataFrame'>
>>> pd_df
7 changes: 4 additions & 3 deletions opensearch_py_ml/groupby.py
@@ -26,6 +26,7 @@
from typing import TYPE_CHECKING, List, Optional, Union

from opensearch_py_ml.query_compiler import QueryCompiler
from opensearch_py_ml.utils import MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE

if TYPE_CHECKING:
import pandas as pd # type: ignore
@@ -153,7 +154,7 @@ def var(self, numeric_only: bool = True) -> "pd.DataFrame":
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["var"],
pd_aggs=[VARIANCE],
dropna=self._dropna,
numeric_only=numeric_only,
)
@@ -206,7 +207,7 @@ def std(self, numeric_only: bool = True) -> "pd.DataFrame":
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["std"],
pd_aggs=[STANDARD_DEVIATION],
dropna=self._dropna,
numeric_only=numeric_only,
)
@@ -259,7 +260,7 @@ def mad(self, numeric_only: bool = True) -> "pd.DataFrame":
"""
return self._query_compiler.aggs_groupby(
by=self._by,
pd_aggs=["mad"],
pd_aggs=[MEAN_ABSOLUTE_DEVIATION],
dropna=self._dropna,
numeric_only=numeric_only,
)
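The metric-name constants imported in the hunks above are, per the substitutions in the diff (`["mad"]` becomes `[MEAN_ABSOLUTE_DEVIATION]`, etc.), plain string identifiers centralized in `opensearch_py_ml/utils.py`, presumably along these lines:

```python
# Centralizing these identifiers avoids scattering typo-prone string literals
# such as "mad" across groupby.py, operations.py, and query_compiler.py.
# Values inferred from the one-to-one replacements visible in this diff.
MEAN_ABSOLUTE_DEVIATION = "mad"
STANDARD_DEVIATION = "std"
VARIANCE = "var"

print(MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE)  # mad std var
```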
61 changes: 51 additions & 10 deletions opensearch_py_ml/operations.py
@@ -65,6 +65,7 @@
SizeTask,
TailTask,
)
from opensearch_py_ml.utils import MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE

if TYPE_CHECKING:
from numpy.typing import DTypeLike
@@ -75,6 +76,8 @@
from opensearch_py_ml.query_compiler import QueryCompiler
from opensearch_py_ml.tasks import Task

PANDAS_MAJOR_VERSION = int(pd.__version__.split(".")[0])


class QueryParams:
def __init__(self) -> None:
@@ -475,7 +478,10 @@ def _terms_aggs(
except IndexError:
name = None

return build_pd_series(results, name=name)
if PANDAS_MAJOR_VERSION < 2:
return build_pd_series(results, name=name)
else:
return build_pd_series(results, index_name=name, name="count")
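The branch above exists because pandas 2 changed `value_counts()`: the result Series is now named `"count"` and the original Series name moves onto the index. A quick illustration (the printed name depends on the installed pandas version):

```python
import pandas as pd

s = pd.Series(["a", "a", "b"], name="Carrier")
vc = s.value_counts()
# pandas < 2:  vc.name == "Carrier", index unnamed
# pandas >= 2: vc.name == "count",   vc.index.name == "Carrier"
print(vc.name)
```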

def _hist_aggs(
self, query_compiler: "QueryCompiler", num_bins: int
@@ -620,7 +626,7 @@ def _unpack_metric_aggs(
values.append(field.nan_value)
# Explicit condition for mad to add NaN because it doesn't support bool
elif is_dataframe_agg and numeric_only:
if pd_agg == "mad":
if pd_agg == MEAN_ABSOLUTE_DEVIATION:
values.append(field.nan_value)
continue

@@ -1097,7 +1103,14 @@
"""
# pd aggs that will be mapped to os aggs
# that can use 'extended_stats'.
extended_stats_pd_aggs = {"mean", "min", "max", "sum", "var", "std"}
extended_stats_pd_aggs = {
"mean",
"min",
"max",
"sum",
VARIANCE,
STANDARD_DEVIATION,
}
extended_stats_os_aggs = {"avg", "min", "max", "sum"}
extended_stats_calls = 0

@@ -1117,15 +1130,15 @@
os_aggs.append("avg")
elif pd_agg == "sum":
os_aggs.append("sum")
elif pd_agg == "std":
elif pd_agg == STANDARD_DEVIATION:
os_aggs.append(("extended_stats", "std_deviation"))
elif pd_agg == "var":
elif pd_agg == VARIANCE:
os_aggs.append(("extended_stats", "variance"))

# Aggs that aren't 'extended_stats' compatible
elif pd_agg == "nunique":
os_aggs.append("cardinality")
elif pd_agg == "mad":
elif pd_agg == MEAN_ABSOLUTE_DEVIATION:
os_aggs.append("median_absolute_deviation")
elif pd_agg == "median":
os_aggs.append(("percentiles", (50.0,)))
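The renamed branches above map pandas aggregation names to OpenSearch aggregations; condensed into a dictionary, the mapping visible in this hunk looks like this (tuples mean "OpenSearch agg plus the field to read from its response"):

```python
# Condensed sketch of _map_pd_aggs_to_os_aggs, limited to the branches
# shown in this hunk; the real method also tracks extended_stats reuse.
PD_TO_OS_AGGS = {
    "mean": "avg",
    "sum": "sum",
    "std": ("extended_stats", "std_deviation"),
    "var": ("extended_stats", "variance"),
    "nunique": "cardinality",
    "mad": "median_absolute_deviation",
    "median": ("percentiles", (50.0,)),
}

print(PD_TO_OS_AGGS["mad"])  # median_absolute_deviation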
@@ -1205,7 +1218,7 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame:

df1 = self.aggs(
query_compiler=query_compiler,
pd_aggs=["count", "mean", "std", "min", "max"],
pd_aggs=["count", "mean", "min", "max", STANDARD_DEVIATION],
numeric_only=True,
)
df2 = self.quantile(
@@ -1219,9 +1232,37 @@
# Convert [.25,.5,.75] to ["25%", "50%", "75%"]
df2 = df2.set_index([["25%", "50%", "75%"]])

return pd.concat([df1, df2]).reindex(
["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
)
df = pd.concat([df1, df2])

if PANDAS_MAJOR_VERSION < 2:
return pd.concat([df1, df2]).reindex(
["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
)
else:
# Note: In recent pandas versions, `describe()` returns a different index order
# for one-column DataFrames compared to multi-column DataFrames.
# We adjust the order manually to ensure consistency.
if df.shape[1] == 1:
# For single-column DataFrames, `describe()` typically outputs:
# ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
return df.reindex(
[
"count",
"mean",
STANDARD_DEVIATION,
"min",
"25%",
"50%",
"75%",
"max",
]
)

# For multi-column DataFrames, `describe()` typically outputs:
# ["count", "mean", "min", "25%", "50%", "75%", "max", "std"]
return df.reindex(
["count", "mean", "min", "25%", "50%", "75%", "max", STANDARD_DEVIATION]
)
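Whatever row order the concatenation produces, the final `reindex` pins the canonical `describe()` ordering, which is the whole point of the branch above:

```python
import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})
stats = df.describe()

# Pin the canonical row order regardless of how the rows come back.
order = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
stats = stats.reindex(order)
print(list(stats.index))
```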

def to_pandas(
self, query_compiler: "QueryCompiler", show_progress: bool = False
7 changes: 4 additions & 3 deletions opensearch_py_ml/query_compiler.py
@@ -45,6 +45,7 @@
from opensearch_py_ml.filter import BooleanFilter, QueryFilter
from opensearch_py_ml.index import Index
from opensearch_py_ml.operations import Operations
from opensearch_py_ml.utils import MEAN_ABSOLUTE_DEVIATION, STANDARD_DEVIATION, VARIANCE

if TYPE_CHECKING:
from opensearchpy import OpenSearch
@@ -587,17 +588,17 @@ def mean(self, numeric_only: Optional[bool] = None) -> pd.Series:

def var(self, numeric_only: Optional[bool] = None) -> pd.Series:
return self._operations._metric_agg_series(
self, ["var"], numeric_only=numeric_only
self, [VARIANCE], numeric_only=numeric_only
)

def std(self, numeric_only: Optional[bool] = None) -> pd.Series:
return self._operations._metric_agg_series(
self, ["std"], numeric_only=numeric_only
self, [STANDARD_DEVIATION], numeric_only=numeric_only
)

def mad(self, numeric_only: Optional[bool] = None) -> pd.Series:
return self._operations._metric_agg_series(
self, ["mad"], numeric_only=numeric_only
self, [MEAN_ABSOLUTE_DEVIATION], numeric_only=numeric_only
)

def median(self, numeric_only: Optional[bool] = None) -> pd.Series:
12 changes: 6 additions & 6 deletions opensearch_py_ml/series.py
@@ -311,12 +311,12 @@ def value_counts(self, os_size: int = 10) -> pd.Series:
>>> from tests import OPENSEARCH_TEST_CLIENT

>>> df = oml.DataFrame(OPENSEARCH_TEST_CLIENT, 'flights')
>>> df['Carrier'].value_counts()
Logstash Airways 3331
JetBeats 3274
Kibana Airlines 3234
ES-Air 3220
Name: Carrier, dtype: int64
>>> for key, value in df['Carrier'].value_counts().items():
... print(key, value)
Logstash Airways 3331
JetBeats 3274
Kibana Airlines 3234
ES-Air 3220
"""
if not isinstance(os_size, int):
raise TypeError("os_size must be a positive integer.")