diff --git a/src/trousse/convert_to_mixed_type.py b/src/trousse/convert_to_mixed_type.py index 698ecee..7d36b57 100644 --- a/src/trousse/convert_to_mixed_type.py +++ b/src/trousse/convert_to_mixed_type.py @@ -2,15 +2,15 @@ import pandas as pd -class _DfConvertToMixedType: +class _ConvertDfToMixedType: """ Convert values from "object"-typed ``column`` column to appropriate format. When pandas package reads from CSV file, the columns that are not completely consistent with a single type are stored with dtype = "object" and every value is converted to "string". - This FeatureOperation subclass convert the string values to numeric, boolean - or datetime values where possible. + This class converts the string values of a column in a pandas DataFrame + to numeric, boolean or datetime values where possible. The transformed column will still have dtype="object" but the inferred type will be "mixed" which allows a correct column categorization by Dataset class. By default the converted column overwrites the related original column. @@ -193,16 +193,27 @@ def _convert_to_datetime_mixed_types(self, col_serie: pd.Series) -> None: def _set_converted_col_dtype(self, col_serie: pd.Series) -> pd.Series: """ - Set the new dtype to ``col_serie`` after conversion + Set the new dtype to ``col_serie`` after conversion. - This method sets the new col_dtype to ``col_serie`` column - TODO: Complete docs + This method updates, if possible, the ``col_serie`` column dtype. + Particularly, if each value of the column has been consistently interpreted + with a single type, the column will be converted to that dtype + (and NaNs will be converted coherently with the new dtype). + On the other hand, if column values are interpreted with multiple types, + the column will maintain the dtype="object" (and the column values will + have multiple types). Parameters ---------- col_serie : pd.Series Series containing the values that will be analyzed. It will not be modified inplace. + + Returns + ------- + pd.Series + Column with the same values as ``col_serie`` and the dtype set + according to the value types. """ if self._col_dtype is None: # If the _col_dtype is not unique and consistent, convert the column diff --git a/src/trousse/dataset.py b/src/trousse/dataset.py index e6aba76..6c79418 100644 --- a/src/trousse/dataset.py +++ b/src/trousse/dataset.py @@ -11,9 +11,9 @@ import pandas as pd from joblib import Parallel, delayed -from .convert_to_mixed_type import _DfConvertToMixedType +from .convert_to_mixed_type import _ConvertDfToMixedType from .exceptions import MultipleObjectsInFileError, NotShelveFileError -from .feature_operations import ConvertToMixedType, FeatureOperation +from .feature_operations import FeatureOperation from .operations_list import OperationsList from .settings import CATEG_COL_THRESHOLD from .util import lazy_property @@ -555,7 +555,7 @@ def _data_to_mixed_types(df: pd.DataFrame): """ str_cols = df.select_dtypes(include="object").columns for col in str_cols: - mixedtype_converter = _DfConvertToMixedType(column=col) + mixedtype_converter = _ConvertDfToMixedType(column=col) df = mixedtype_converter(df) return df diff --git a/src/trousse/feature_operations.py b/src/trousse/feature_operations.py index 7e1d04a..9e5f69c 100644 --- a/src/trousse/feature_operations.py +++ b/src/trousse/feature_operations.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from typing import Any, List, Mapping -from .convert_to_mixed_type import _DfConvertToMixedType +from .convert_to_mixed_type import _ConvertDfToMixedType from .util import is_sequence_and_not_str if typing.TYPE_CHECKING: @@ -302,8 +302,11 @@ class ConvertToMixedType(FeatureOperation): consistent with a single type are stored with dtype = "object" and every value is converted to "string". This FeatureOperation subclass convert the string values to numeric, boolean - or datetime values where possible. - The transformed column will still have dtype="object" but the inferred type will + or datetime values where possible. When each value can consistently be + interpreted with a single type, the column will be converted to that dtype + (and NaNs will be converted appropriately). + On the other hand, if column values are interpreted with multiple types, + the column will maintain the dtype="object" but the inferred type will be "mixed" which allows a correct column categorization by Dataset class. By default the converted column overwrites the related original column. To store the result of conversion in another column, ``derived_columns`` @@ -377,7 +380,8 @@ def __eq__(self, other: Any) -> bool: return False def _apply(self, dataset: "Dataset") -> "Dataset": - """Apply ReplaceStrings operation on a new Dataset instance and return it. + """ + Apply ConvertToMixedType operation on a new Dataset instance and return it. Parameters ---------- @@ -391,7 +395,7 @@ def _apply(self, dataset: "Dataset") -> "Dataset": """ dataset = copy.deepcopy(dataset) - mixedtype_converter = _DfConvertToMixedType( + mixedtype_converter = _ConvertDfToMixedType( column=self.columns[0], derived_column=self.derived_columns[0] ) dataset.data = mixedtype_converter(dataset.data)