Skip to content

Commit

Permalink
Backport PR #60321: TST (string dtype): resolve all xfails in IO parser tests (#60330)
Browse files Browse the repository at this point in the history

* Backport PR #60321: TST (string dtype): resolve all xfails in IO parser tests

(cherry picked from commit ee3c18f)

* BUG: Avoid RangeIndex conversion in read_csv if dtype is specified (#59316)


Co-authored-by: Joris Van den Bossche <[email protected]>
Co-authored-by: Matthew Roeschke <[email protected]>
  • Loading branch information
3 people authored Nov 18, 2024
1 parent 0bcd250 commit 112c2e9
Show file tree
Hide file tree
Showing 12 changed files with 89 additions and 71 deletions.
36 changes: 25 additions & 11 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
arrays = []
converters = self._clean_mapping(self.converters)

for i, arr in enumerate(index):
if self.index_names is not None:
names: Iterable = self.index_names
else:
names = itertools.cycle([None])
for i, (arr, name) in enumerate(zip(index, names)):
if try_parse_dates and self._should_parse_dates(i):
arr = self._date_conv(
arr,
Expand Down Expand Up @@ -504,12 +508,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
arr, _ = self._infer_types(
arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
)
arrays.append(arr)

names = self.index_names
index = ensure_index_from_sequences(arrays, names)
if cast_type is not None:
# Don't perform RangeIndex inference
idx = Index(arr, name=name, dtype=cast_type)
else:
idx = ensure_index_from_sequences([arr], [name])
arrays.append(idx)

return index
if len(arrays) == 1:
return arrays[0]
else:
return MultiIndex.from_arrays(arrays)

@final
def _convert_to_ndarrays(
Expand Down Expand Up @@ -1084,12 +1093,11 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
dtype_dict: defaultdict[Hashable, Any]
if not is_dict_like(dtype):
# if dtype == None, default will be object.
default_dtype = dtype or object
dtype_dict = defaultdict(lambda: default_dtype)
dtype_dict = defaultdict(lambda: dtype)
else:
dtype = cast(dict, dtype)
dtype_dict = defaultdict(
lambda: object,
lambda: None,
{columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
)

Expand All @@ -1106,8 +1114,14 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
if (index_col is None or index_col is False) or index_names is None:
index = default_index(0)
else:
data = [Series([], dtype=dtype_dict[name]) for name in index_names]
index = ensure_index_from_sequences(data, names=index_names)
# TODO: We could return default_index(0) if dtype_dict[name] is None
data = [
Index([], name=name, dtype=dtype_dict[name]) for name in index_names
]
if len(data) == 1:
index = data[0]
else:
index = MultiIndex.from_arrays(data)
index_col.sort()

for i, n in enumerate(index_col):
Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/io/parser/common/test_chunksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs import parsers as libparsers
from pandas.errors import DtypeWarning

Expand Down Expand Up @@ -230,8 +228,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
assert result.a.dtype == float


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_warn_if_chunks_have_mismatched_type(all_parsers):
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
warning_type = None
parser = all_parsers
size = 10000
Expand Down Expand Up @@ -259,8 +256,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
"Specify dtype option on import or set low_memory=False.",
buf,
)

assert df.a.dtype == object
if parser.engine == "c" and parser.low_memory:
assert df.a.dtype == object
elif using_infer_string:
assert df.a.dtype == "str"
else:
assert df.a.dtype == object


@pytest.mark.parametrize("iterator", [True, False])
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/io/parser/common/test_file_buffer_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import (
EmptyDataError,
ParserError,
Expand Down Expand Up @@ -69,14 +67,13 @@ def test_local_file(all_parsers, csv_dir_path):
pytest.skip("Failing on: " + " ".join(platform.uname()))


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@xfail_pyarrow # AssertionError: DataFrame.index are different
def test_path_path_lib(all_parsers):
parser = all_parsers
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
tm.assert_frame_equal(df, result)
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/io/parser/common/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -87,9 +85,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(all_parsers, index_col):
def test_multi_index_no_level_names(
request, all_parsers, index_col, using_infer_string
):
if using_infer_string and all_parsers.engine == "pyarrow":
# result should have string columns instead of object dtype
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
Expand Down
22 changes: 17 additions & 5 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import ParserWarning

import pandas as pd
Expand All @@ -24,6 +22,8 @@
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
Expand Down Expand Up @@ -54,7 +54,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_per_column(all_parsers):
parser = all_parsers
Expand All @@ -68,7 +67,6 @@ def test_dtype_per_column(all_parsers):
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
)
expected["one"] = expected["one"].astype(np.float64)
expected["two"] = expected["two"].astype(object)

result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -598,6 +596,7 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_accurate_parsing_of_large_integers(all_parsers):
# GH#52505
data = """SYMBOL,MOMENT,ID,ID_DEAL
Expand All @@ -608,7 +607,7 @@ def test_accurate_parsing_of_large_integers(all_parsers):
AMZN,20230301181139587,2023552585717889759,2023552585717263360
MSFT,20230301181139587,2023552585717889863,2023552585717263361
NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
Expand All @@ -630,3 +629,16 @@ def test_dtypes_with_usecols(all_parsers):
values = ["1", "4"]
expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
tm.assert_frame_equal(result, expected)


def test_index_col_with_dtype_no_rangeindex(all_parsers):
    """Check that an explicit dtype on the index column is kept by read_csv.

    The ``bin_id`` column holds ``0, 1`` and is requested as ``uint32``; the
    index of the parsed frame is compared against a plain ``Index`` with that
    dtype and name.
    """
    column_dtypes = {"start": np.float32, "stop": np.float32, "bin_id": np.uint32}
    buffer = StringIO("345.5,519.5,0\n519.5,726.5,1")

    frame = all_parsers.read_csv(
        buffer,
        header=None,
        names=["start", "stop", "bin_id"],
        dtype=column_dtypes,
        index_col="bin_id",
    )
    result = frame.index

    expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
    tm.assert_index_equal(result, expected)
13 changes: 7 additions & 6 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import (
ParserError,
Expand Down Expand Up @@ -185,8 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
assert max(precise_errors) <= max(normal_errors)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_usecols_dtypes(c_parser_only):
def test_usecols_dtypes(c_parser_only, using_infer_string):
parser = c_parser_only
data = """\
1,2,3
Expand All @@ -211,8 +208,12 @@ def test_usecols_dtypes(c_parser_only):
dtype={"b": int, "c": float},
)

assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()
if using_infer_string:
assert (result.dtypes == ["string", int, float]).all()
assert (result2.dtypes == ["string", float]).all()
else:
assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()


def test_disable_bool_parsing(c_parser_only):
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/io/parser/test_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -186,7 +184,6 @@ def convert_score(x):
tm.assert_frame_equal(results[0], results[1])


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
# see gh-1835 , GH#40589
Expand All @@ -205,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f):
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
)

xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
tm.assert_frame_equal(rs, xp)


Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/io/parser/test_index_col.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -344,7 +342,6 @@ def test_infer_types_boolean_sum(all_parsers):
tm.assert_frame_equal(result, expected, check_index_type=False)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
# GH#9435
Expand All @@ -355,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
)
result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
tm.assert_frame_equal(result, expected)


Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/io/parser/test_mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@

import pytest

from pandas._config import using_string_dtype

from pandas import DataFrame
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
Expand Down Expand Up @@ -120,7 +121,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
parser.read_csv(StringIO(data), names=names)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@xfail_pyarrow # AssertionError: DataFrame.columns are different
def test_mangled_unnamed_placeholders(all_parsers):
# xref gh-13017
Expand All @@ -132,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers):

# This test recursively updates `df`.
for i in range(3):
expected = DataFrame()
expected = DataFrame(columns=Index([], dtype="str"))

for j in range(i + 1):
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
Expand Down
Loading

0 comments on commit 112c2e9

Please sign in to comment.