Skip to content

Commit

Permalink
FEAT-#6574: UserWarning no longer displayed when Series/DataFrames are small (#7323)
Browse files Browse the repository at this point in the history

Signed-off-by: Jayson Willey <[email protected]>
  • Loading branch information
Jayson729 authored Jun 19, 2024
1 parent c647cf4 commit e9ab99a
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 24 deletions.
9 changes: 6 additions & 3 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,9 +205,6 @@ def __init__(
self._query_compiler = distributed_frame._query_compiler
return

warnings.warn(
"Distributing {} object. This may take some time.".format(type(data))
)
if isinstance(data, pandas.Index):
pass
elif (
Expand Down Expand Up @@ -253,6 +250,12 @@ def __init__(
pandas_df = pandas.DataFrame(
data=data, index=index, columns=columns, dtype=dtype, copy=copy
)
if pandas_df.size >= 1_000_000:
warnings.warn(
"Distributing {} object. This may take some time.".format(
type(data)
)
)
self._query_compiler = from_pandas(pandas_df)._query_compiler
else:
self._query_compiler = query_compiler
Expand Down
28 changes: 15 additions & 13 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,26 +137,28 @@ def __init__(
query_compiler.columns = pandas.Index([MODIN_UNNAMED_SERIES_LABEL])
if query_compiler is None:
# Defaulting to pandas
warnings.warn(
"Distributing {} object. This may take some time.".format(type(data))
)
if name is None:
name = MODIN_UNNAMED_SERIES_LABEL
if isinstance(data, pandas.Series) and data.name is not None:
name = data.name

query_compiler = from_pandas(
pandas.DataFrame(
pandas.Series(
data=data,
index=index,
dtype=dtype,
name=name,
copy=copy,
fastpath=fastpath,
pandas_df = pandas.DataFrame(
pandas.Series(
data=data,
index=index,
dtype=dtype,
name=name,
copy=copy,
fastpath=fastpath,
)
)
if pandas_df.size >= 2_500_000:
warnings.warn(
"Distributing {} object. This may take some time.".format(
type(data)
)
)
)._query_compiler
query_compiler = from_pandas(pandas_df)._query_compiler
self._query_compiler = query_compiler.columnarize()
if name is not None:
self.name = name
Expand Down
34 changes: 34 additions & 0 deletions modin/tests/pandas/dataframe/test_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -1473,3 +1473,37 @@ def test_df_from_series_with_tuple_name():
df_equals(pd.DataFrame(pandas.Series(name=("a", 1))), pandas_result)
# 2. Creating a Modin DF from Modin Series
df_equals(pd.DataFrame(pd.Series(name=("a", 1))), pandas_result)


def test_large_df_warns_distributing_takes_time():
    """Check that a 1,000,000-element DataFrame emits the "Distributing" UserWarning."""
    # https://github.com/modin-project/modin/issues/6574
    distributing_message = r"Distributing (.*) object\. This may take some time\."
    # 100_000 rows x 10 columns == 1_000_000 elements.
    values = np.random.randint(1_000_000, size=(100_000, 10))

    with pytest.warns(UserWarning, match=distributing_message):
        pd.DataFrame(values)


def test_large_series_warns_distributing_takes_time():
    """Check that a 2,500,000-element Series emits the "Distributing" UserWarning."""
    # https://github.com/modin-project/modin/issues/6574
    distributing_message = r"Distributing (.*) object\. This may take some time\."
    values = np.random.randint(1_000_000, size=2_500_000)

    with pytest.warns(UserWarning, match=distributing_message):
        pd.Series(values)


def test_df_does_not_warn_distributing_takes_time():
    """Check that a 900,000-element DataFrame does not emit the "Distributing" UserWarning."""
    # https://github.com/modin-project/modin/issues/6574
    distributing_message = r"Distributing (.*) object\. This may take some time\."
    # 100_000 rows x 9 columns == 900_000 elements.
    values = np.random.randint(1_000_000, size=(100_000, 9))

    with warnings.catch_warnings():
        # Escalate a matching UserWarning to an exception so the test fails on it.
        warnings.filterwarnings(
            action="error", message=distributing_message, category=UserWarning
        )
        pd.DataFrame(values)


def test_series_does_not_warn_distributing_takes_time():
    """Check that a 2,400,000-element Series does not emit the "Distributing" UserWarning."""
    # https://github.com/modin-project/modin/issues/6574
    distributing_message = r"Distributing (.*) object\. This may take some time\."
    values = np.random.randint(1_000_000, size=2_400_000)

    with warnings.catch_warnings():
        # Escalate a matching UserWarning to an exception so the test fails on it.
        warnings.filterwarnings(
            action="error", message=distributing_message, category=UserWarning
        )
        pd.Series(values)
14 changes: 6 additions & 8 deletions modin/tests/pandas/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import json
import sys
import unittest.mock as mock
import warnings

import matplotlib
import numpy as np
Expand All @@ -26,7 +27,7 @@
import pytest
from numpy.testing import assert_array_equal
from pandas.core.indexing import IndexingError
from pandas.errors import SpecificationError
from pandas.errors import PerformanceWarning, SpecificationError

import modin.pandas as pd
from modin.config import Engine, NPartitions, StorageFormat
Expand Down Expand Up @@ -3429,13 +3430,10 @@ def test_sub(data):

def test_6782():
    """Check that Series-minus-datetime does not emit the "not vectorized" PerformanceWarning.

    Subtracts a ``datetime.datetime`` scalar from a one-element datetime
    Series while the matching ``PerformanceWarning`` is escalated to an
    error, so the test fails if that warning is emitted.
    """
    datetime_scalar = datetime.datetime(1970, 1, 1, 0, 0)
    match = "Adding/subtracting object-dtype array to DatetimeArray not vectorized"
    with warnings.catch_warnings():
        # Only the matching PerformanceWarning is turned into an exception;
        # no other filter is installed by this test.
        warnings.filterwarnings("error", match, PerformanceWarning)
        pd.Series([datetime.datetime(2000, 1, 1)]) - datetime_scalar


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
Expand Down

0 comments on commit e9ab99a

Please sign in to comment.