
Commit

Gradually adapt encoding functions in feature_fix to new FeatureOperation usage
alessiamarcolini committed Nov 19, 2020
1 parent a447e2c commit b2d991f
Showing 2 changed files with 32 additions and 60 deletions.
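
For orientation, the change below moves the private encoding helpers from a DataFrame-in/DataFrame-out style to a Dataset-in/Dataset-out style: they now take a trousse Dataset, return the encoded Dataset together with the derived column names, and no longer return the fitted encoder. A minimal sketch of the new calling convention, assuming a CSV file "data.csv" with a categorical column "color" (file and column names are hypothetical):

    import trousse.feature_fix as ffx
    from trousse.dataset import Dataset

    # Build a Dataset from a CSV file (path and column name are illustrative only).
    dataset = Dataset(data_file="data.csv")

    # New convention: Dataset in, (encoded Dataset, derived column names) out.
    encoded_dataset, derived_columns = ffx._ordinal_encode_column(dataset, "color")

    print(derived_columns)               # e.g. ["color_enc"]
    print(encoded_dataset.data.head())   # encoded data lives on the returned Dataset
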
80 changes: 27 additions & 53 deletions src/trousse/feature_fix.py
@@ -3,9 +3,8 @@
from typing import Any, Tuple

import numpy as np
import pandas as pd

from .dataset import Dataset, copy_dataset_with_new_df
from .dataset import Dataset
from .feature_operations import FeatureOperation, OneHotEncoder, OrdinalEncoder

logger = logging.getLogger(__name__)
@@ -176,10 +175,9 @@ def combine_categorical_columns_to_one(


def _one_hot_encode_column(
df: pd.DataFrame,
dataset: Dataset,
column: str,
drop_one_new_column: bool = True,
drop_old_column: bool = False,
):
"""
OneHotEncoding of 'column' in df
@@ -189,54 +187,52 @@ def _one_hot_encode_column(
df
column
drop_one_new_column
drop_old_column
Returns
-------
"""
dataset = Dataset(df_object=df)
drop_option = "first" if drop_one_new_column else None
one_hot_encoder = OneHotEncoder(
columns=[column], derived_column_suffix="_enc", drop_option=drop_option
)

encoded_dataset = one_hot_encoder(dataset)

new_columns = sorted(
derived_columns = sorted(
list(set(encoded_dataset.data.columns) - set(dataset.data.columns))
)
return encoded_dataset.data, one_hot_encoder.encoder, new_columns
return encoded_dataset, derived_columns


def _ordinal_encode_column(df, column, drop_old_column: bool = False):
def _ordinal_encode_column(
dataset: Dataset,
column: str,
):
"""
Parameters
----------
df
column
drop_old_column
Returns
-------
"""

dataset = Dataset(df_object=df)
derived_column = f"{column}_enc"
ordinal_encoder = OrdinalEncoder(columns=[column], derived_columns=[derived_column])

encoded_dataset = ordinal_encoder(dataset)
return encoded_dataset.data, ordinal_encoder.encoder, [derived_column]
return encoded_dataset, [derived_column]


def encode_single_categorical_column(
dataset: Dataset,
col_name: str,
encoding: Any = "EncodingFunctions.ORDINAL",
drop_one_new_column: bool = True,
drop_old_column: bool = False,
force: bool = False,
case_sensitive: bool = False,
):
@@ -268,42 +264,41 @@ def encode_single_categorical_column(
"""
# If the column has already been encoded and the new column has already been created, return dataset
enc_column = dataset.get_enc_column_from_original(column_name=col_name)
enc_column = dataset.encoded_columns_from_original(column=col_name)

# Check if encoding operation is required
if not force:
if enc_column is not None:
if len(enc_column) > 0:
logging.warning(
f"The column {col_name} has already been encoded "
f'as "{enc_column}". No further operations are performed '
)
return dataset
elif dataset[col_name].dtype.kind in "biufc":
logging.warning(
f"The column {col_name} is already numeric. No further operations are performed "
f'as "{enc_column}". No further operations are performed.'
)
return dataset
# elif dataset[col_name].dtype.kind in "biufc":
# logging.warning(
# f"The column {col_name} is already numeric. No further operations are performed "
# )
# return dataset

dataset_to_encode = dataset.copy()

df_to_encode = dataset.data.copy()
# Find index of rows with NaN and convert it to a fixed value so the corresponding encoded col will be dropped
nan_serie_map = df_to_encode[col_name].isna()
nan_serie_map = nan_serie_map.index[nan_serie_map].tolist()
df_to_encode.loc[nan_serie_map][col_name] = NAN_CATEGORY.title()
# Set to 'title' case so str with different capitalization are interpreted as equal
if not case_sensitive:
df_to_encode.loc[:, col_name] = df_to_encode[col_name].astype(str).str.title()
dataset_to_encode.data.loc[:, col_name] = (
dataset_to_encode.data[col_name].astype(str).str.title()
)

# Encoding using the selected function
if encoding == "ORDINAL":
df_encoded, encoder, new_columns = _ordinal_encode_column(
df_to_encode, column=col_name, drop_old_column=drop_old_column
dataset_encoded, _ = _ordinal_encode_column(
dataset_to_encode,
column=col_name,
)
elif encoding == "ONEHOT":
df_encoded, encoder, new_columns = _one_hot_encode_column(
df_to_encode,
dataset_encoded, _ = _one_hot_encode_column(
dataset_to_encode,
column=col_name,
drop_one_new_column=drop_one_new_column,
drop_old_column=drop_old_column,
)
else:
logging.error(
@@ -312,27 +307,6 @@
)
return None

# Set the rows with missing values originally to NaN
df_encoded.loc[nan_serie_map, col_name] = pd.NA
df_encoded.loc[nan_serie_map, new_columns] = np.nan

# Generate encoded values map
encoded_values_map = {}
for val_id, val in enumerate(encoder.categories_[0]):
encoded_values_map[val_id] = val

dataset_encoded = copy_dataset_with_new_df(dataset, df_encoded)

dataset_encoded.track_history(
FeatureOperation(
original_columns=col_name,
operation_type="CATEGORICAL_ENCODING",
encoder=encoder,
encoded_values_map=encoded_values_map,
derived_columns=tuple(new_columns),
)
)

return dataset_encoded


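The core of the new FeatureOperation usage shown above is that the encoders are FeatureOperation instances applied directly to a Dataset, returning a new Dataset. A short sketch of that pattern in isolation, using the same hypothetical "data.csv"/"color" names as before:

    from trousse.dataset import Dataset
    from trousse.feature_operations import OneHotEncoder, OrdinalEncoder

    dataset = Dataset(data_file="data.csv")  # hypothetical CSV path

    # One-hot encoding: derived columns get the "_enc" suffix; drop_option="first"
    # mirrors the drop_one_new_column=True branch of _one_hot_encode_column.
    one_hot = OneHotEncoder(columns=["color"], derived_column_suffix="_enc", drop_option="first")
    onehot_dataset = one_hot(dataset)

    # Ordinal encoding: the derived column name is passed explicitly.
    ordinal = OrdinalEncoder(columns=["color"], derived_columns=["color_enc"])
    ordinal_dataset = ordinal(dataset)
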
12 changes: 5 additions & 7 deletions tests/integration/test_feature_fix.py
@@ -1,6 +1,5 @@
import pandas as pd
import pytest
import sklearn.preprocessing as sk_preproc

import trousse.feature_fix as ffx
from trousse.dataset import Dataset
@@ -24,9 +23,9 @@ def test_ordinal_encode_column(csv, column, derived_column, expected_csv):
dataset = Dataset(data_file=csv)
expected_df = load_expectation(expected_csv, type_="csv")

encoded_df, _, new_cols = ffx._ordinal_encode_column(dataset.data, column, False)
encoded_dataset, new_cols = ffx._ordinal_encode_column(dataset, column)

pd.testing.assert_frame_equal(encoded_df, expected_df)
pd.testing.assert_frame_equal(encoded_dataset.data, expected_df)
assert derived_column == new_cols


@@ -55,10 +54,9 @@ def test_one_hot_encode_column(
dataset = Dataset(data_file=csv)
expected_df = load_expectation(expected_csv, type_="csv")

encoded_df, encoder, new_cols = ffx._one_hot_encode_column(
dataset.data, column, drop_one_new_column
encoded_dataset, new_cols = ffx._one_hot_encode_column(
dataset, column, drop_one_new_column
)

assert expected_new_cols == new_cols
pd.testing.assert_frame_equal(encoded_df, expected_df, check_dtype=False)
assert isinstance(encoder, sk_preproc.OneHotEncoder)
pd.testing.assert_frame_equal(encoded_dataset.data, expected_df, check_dtype=False)
