Skip to content

Commit

Permalink
Renamed to _ConvertDfToMixedType.
Browse files Browse the repository at this point in the history
Added some missing comments
  • Loading branch information
lorenz-gorini committed Oct 24, 2020
1 parent 25f9497 commit 8c1b559
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 14 deletions.
23 changes: 17 additions & 6 deletions src/trousse/convert_to_mixed_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
import pandas as pd


class _DfConvertToMixedType:
class _ConvertDfToMixedType:
"""
Convert values of the "object"-typed column ``column`` to the appropriate format.
When the pandas package reads a CSV file, the columns that are not completely
consistent with a single type are stored with dtype = "object" and every value
is converted to "string".
This FeatureOperation subclass convert the string values to numeric, boolean
or datetime values where possible.
This class converts the string values of a column in a pandas DataFrame
to numeric, boolean or datetime values where possible.
The transformed column will still have dtype="object" but the inferred type will
be "mixed" which allows a correct column categorization by Dataset class.
By default the converted column overwrites the related original column.
Expand Down Expand Up @@ -193,16 +193,27 @@ def _convert_to_datetime_mixed_types(self, col_serie: pd.Series) -> None:

def _set_converted_col_dtype(self, col_serie: pd.Series) -> pd.Series:
"""
Set the new dtype to ``col_serie`` after conversion
Set the new dtype to ``col_serie`` after conversion.
This method sets the new col_dtype to ``col_serie`` column
TODO: Complete docs
This method updates, if possible, the ``col_serie`` column dtype.
Particularly, if each value of the column has been consistently interpreted
with a single type, the column will be converted to that dtype
(and NaNs will be converted coherently with the new dtype).
On the other hand, if column values are interpreted with multiple types,
the column will maintain the dtype="object" (and the column values will
have multiple types).
Parameters
----------
col_serie : pd.Series
Series containing the values that will be analyzed. It will not be
modified inplace.
Returns
-------
pd.Series
Column with the same values as ``col_serie`` and the dtype set
according to the value types.
"""
if self._col_dtype is None:
# If the _col_dtype is not unique and consistent, convert the column
Expand Down
6 changes: 3 additions & 3 deletions src/trousse/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
import pandas as pd
from joblib import Parallel, delayed

from .convert_to_mixed_type import _DfConvertToMixedType
from .convert_to_mixed_type import _ConvertDfToMixedType
from .exceptions import MultipleObjectsInFileError, NotShelveFileError
from .feature_operations import ConvertToMixedType, FeatureOperation
from .feature_operations import FeatureOperation
from .operations_list import OperationsList
from .settings import CATEG_COL_THRESHOLD
from .util import lazy_property
Expand Down Expand Up @@ -555,7 +555,7 @@ def _data_to_mixed_types(df: pd.DataFrame):
"""
str_cols = df.select_dtypes(include="object").columns
for col in str_cols:
mixedtype_converter = _DfConvertToMixedType(column=col)
mixedtype_converter = _ConvertDfToMixedType(column=col)
df = mixedtype_converter(df)

return df
Expand Down
14 changes: 9 additions & 5 deletions src/trousse/feature_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from abc import ABC, abstractmethod
from typing import Any, List, Mapping

from .convert_to_mixed_type import _DfConvertToMixedType
from .convert_to_mixed_type import _ConvertDfToMixedType
from .util import is_sequence_and_not_str

if typing.TYPE_CHECKING:
Expand Down Expand Up @@ -302,8 +302,11 @@ class ConvertToMixedType(FeatureOperation):
consistent with a single type are stored with dtype = "object" and every value
is converted to "string".
This FeatureOperation subclass converts the string values to numeric, boolean
or datetime values where possible.
The transformed column will still have dtype="object" but the inferred type will
or datetime values where possible. When each value can consistently be
interpreted with a single type, the column will be converted to that dtype
(and NaNs will be converted appropriately).
On the other hand, if column values are interpreted with multiple types,
the column will maintain the dtype="object" but the inferred type will
be "mixed" which allows a correct column categorization by Dataset class.
By default the converted column overwrites the related original column.
To store the result of conversion in another column, ``derived_columns``
Expand Down Expand Up @@ -377,7 +380,8 @@ def __eq__(self, other: Any) -> bool:
return False

def _apply(self, dataset: "Dataset") -> "Dataset":
"""Apply ReplaceStrings operation on a new Dataset instance and return it.
"""
Apply ConvertToMixedType operation on a new Dataset instance and return it.
Parameters
----------
Expand All @@ -391,7 +395,7 @@ def _apply(self, dataset: "Dataset") -> "Dataset":
"""
dataset = copy.deepcopy(dataset)

mixedtype_converter = _DfConvertToMixedType(
mixedtype_converter = _ConvertDfToMixedType(
column=self.columns[0], derived_column=self.derived_columns[0]
)
dataset.data = mixedtype_converter(dataset.data)
Expand Down

0 comments on commit 8c1b559

Please sign in to comment.