Backport PR #60321: TST (string dtype): resolve all xfails in IO pars… (

#60330) * Backport PR #60321: TST (string dtype): resolve all xfails in IO parser tests (cherry picked from commit ee3c18f) * BUG: Avoid RangeIndex conversion in read_csv if dtype is specified (#59316) Co-authored-by: Joris Van den Bossche <[email protected]> Co-authored-by: Matthew Roeschke <[email protected]>
pandas-dev · Nov 18, 2024 · 112c2e9 · 112c2e9
1 parent 0bcd250
commit 112c2e9
Show file tree

Hide file tree

Showing 12 changed files with 89 additions and 71 deletions.
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -464,7 +464,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
         arrays = []
         converters = self._clean_mapping(self.converters)
 
-        for i, arr in enumerate(index):
+        if self.index_names is not None:
+            names: Iterable = self.index_names
+        else:
+            names = itertools.cycle([None])
+        for i, (arr, name) in enumerate(zip(index, names)):
             if try_parse_dates and self._should_parse_dates(i):
                 arr = self._date_conv(
                     arr,
@@ -504,12 +508,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
             arr, _ = self._infer_types(
                 arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
             )
-            arrays.append(arr)
-
-        names = self.index_names
-        index = ensure_index_from_sequences(arrays, names)
+            if cast_type is not None:
+                # Don't perform RangeIndex inference
+                idx = Index(arr, name=name, dtype=cast_type)
+            else:
+                idx = ensure_index_from_sequences([arr], [name])
+            arrays.append(idx)
 
-        return index
+        if len(arrays) == 1:
+            return arrays[0]
+        else:
+            return MultiIndex.from_arrays(arrays)
 
     @final
     def _convert_to_ndarrays(
@@ -1084,12 +1093,11 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
         dtype_dict: defaultdict[Hashable, Any]
         if not is_dict_like(dtype):
             # if dtype == None, default will be object.
-            default_dtype = dtype or object
-            dtype_dict = defaultdict(lambda: default_dtype)
+            dtype_dict = defaultdict(lambda: dtype)
         else:
             dtype = cast(dict, dtype)
             dtype_dict = defaultdict(
-                lambda: object,
+                lambda: None,
                 {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
             )
 
@@ -1106,8 +1114,14 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
         if (index_col is None or index_col is False) or index_names is None:
             index = default_index(0)
         else:
-            data = [Series([], dtype=dtype_dict[name]) for name in index_names]
-            index = ensure_index_from_sequences(data, names=index_names)
+            # TODO: We could return default_index(0) if dtype_dict[name] is None
+            data = [
+                Index([], name=name, dtype=dtype_dict[name]) for name in index_names
+            ]
+            if len(data) == 1:
+                index = data[0]
+            else:
+                index = MultiIndex.from_arrays(data)
             index_col.sort()
 
             for i, n in enumerate(index_col):

diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
@@ -7,8 +7,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs import parsers as libparsers
 from pandas.errors import DtypeWarning
 
@@ -230,8 +228,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
     assert result.a.dtype == float
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-def test_warn_if_chunks_have_mismatched_type(all_parsers):
+def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
     warning_type = None
     parser = all_parsers
     size = 10000
@@ -259,8 +256,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
             "Specify dtype option on import or set low_memory=False.",
             buf,
         )
-
-    assert df.a.dtype == object
+    if parser.engine == "c" and parser.low_memory:
+        assert df.a.dtype == object
+    elif using_infer_string:
+        assert df.a.dtype == "str"
+    else:
+        assert df.a.dtype == object
 
 
 @pytest.mark.parametrize("iterator", [True, False])

diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -14,8 +14,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -69,14 +67,13 @@ def test_local_file(all_parsers, csv_dir_path):
         pytest.skip("Failing on: " + " ".join(platform.uname()))
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @xfail_pyarrow  # AssertionError: DataFrame.index are different
 def test_path_path_lib(all_parsers):
     parser = all_parsers
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
     result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
     tm.assert_frame_equal(df, result)

diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py
@@ -8,8 +8,6 @@
 
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Index,
@@ -87,9 +85,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
-def test_multi_index_no_level_names(all_parsers, index_col):
+def test_multi_index_no_level_names(
+    request, all_parsers, index_col, using_infer_string
+):
+    if using_infer_string and all_parsers.engine == "pyarrow":
+        # result should have string columns instead of object dtype
+        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
     data = """index1,index2,A,B,C,D
 foo,one,2,3,4,5
 foo,two,7,8,9,10

diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import ParserWarning
 
 import pandas as pd
@@ -24,6 +22,8 @@
     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
 )
 
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
+
 
 @pytest.mark.parametrize("dtype", [str, object])
 @pytest.mark.parametrize("check_orig", [True, False])
@@ -54,7 +54,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
         tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_per_column(all_parsers):
     parser = all_parsers
@@ -68,7 +67,6 @@ def test_dtype_per_column(all_parsers):
         [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
     )
     expected["one"] = expected["one"].astype(np.float64)
-    expected["two"] = expected["two"].astype(object)
 
     result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
     tm.assert_frame_equal(result, expected)
@@ -598,6 +596,7 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
     tm.assert_frame_equal(result, expected)
 
 
+@xfail_pyarrow
 def test_accurate_parsing_of_large_integers(all_parsers):
     # GH#52505
     data = """SYMBOL,MOMENT,ID,ID_DEAL
@@ -608,7 +607,7 @@ def test_accurate_parsing_of_large_integers(all_parsers):
 AMZN,20230301181139587,2023552585717889759,2023552585717263360
 MSFT,20230301181139587,2023552585717889863,2023552585717263361
 NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
-    orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
+    orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
     assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
     assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
     assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
@@ -630,3 +629,16 @@ def test_dtypes_with_usecols(all_parsers):
         values = ["1", "4"]
     expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_index_col_with_dtype_no_rangeindex(all_parsers):
+    data = StringIO("345.5,519.5,0\n519.5,726.5,1")
+    result = all_parsers.read_csv(
+        data,
+        header=None,
+        names=["start", "stop", "bin_id"],
+        dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
+        index_col="bin_id",
+    ).index
+    expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
+    tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
@@ -17,8 +17,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat.numpy import np_version_gte1p24
 from pandas.errors import (
     ParserError,
@@ -185,8 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
     assert max(precise_errors) <= max(normal_errors)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_usecols_dtypes(c_parser_only):
+def test_usecols_dtypes(c_parser_only, using_infer_string):
     parser = c_parser_only
     data = """\
 1,2,3
@@ -211,8 +208,12 @@ def test_usecols_dtypes(c_parser_only):
         dtype={"b": int, "c": float},
     )
 
-    assert (result.dtypes == [object, int, float]).all()
-    assert (result2.dtypes == [object, float]).all()
+    if using_infer_string:
+        assert (result.dtypes == ["string", int, float]).all()
+        assert (result2.dtypes == ["string", float]).all()
+    else:
+        assert (result.dtypes == [object, int, float]).all()
+        assert (result2.dtypes == [object, float]).all()
 
 
 def test_disable_bool_parsing(c_parser_only):

diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -186,7 +184,6 @@ def convert_score(x):
         tm.assert_frame_equal(results[0], results[1])
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("conv_f", [lambda x: x, str])
 def test_converter_index_col_bug(all_parsers, conv_f):
     # see gh-1835 , GH#40589
@@ -205,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f):
         StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
     )
 
-    xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
+    xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
     tm.assert_frame_equal(rs, xp)
 
 

diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Index,
@@ -344,7 +342,6 @@ def test_infer_types_boolean_sum(all_parsers):
     tm.assert_frame_equal(result, expected, check_index_type=False)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
 def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
     # GH#9435
@@ -355,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
             pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
         )
     result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
-    expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
+    expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
     tm.assert_frame_equal(result, expected)
 
 

diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -7,9 +7,10 @@
 
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas import DataFrame
+from pandas import (
+    DataFrame,
+    Index,
+)
 import pandas._testing as tm
 
 xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@@ -120,7 +121,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
         parser.read_csv(StringIO(data), names=names)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @xfail_pyarrow  # AssertionError: DataFrame.columns are different
 def test_mangled_unnamed_placeholders(all_parsers):
     # xref gh-13017
@@ -132,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers):
 
     # This test recursively updates `df`.
     for i in range(3):
-        expected = DataFrame()
+        expected = DataFrame(columns=Index([], dtype="str"))
 
         for j in range(i + 1):
             col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)