diff --git a/extractors.py b/extractors.py index fc0b8d1..b088b2d 100644 --- a/extractors.py +++ b/extractors.py @@ -251,7 +251,7 @@ def apply_tags(data, tags, base_tags=None, additional_tags=[], minimal=True): @staticmethod - def convert_columns_to_category(data, additional_columns:list = [], excluded_columns:set = {}): + def convert_columns_to_category(data, additional_columns:list = [], excluded_columns:set = {}, numerical_columns:set = {}): excluded_columns = set(excluded_columns).union(DEFAULT_CATEGORICALS_COLUMN_EXCLUSION_SET) col_list = [] @@ -265,11 +265,16 @@ def convert_columns_to_category(data, additional_columns:list = [], excluded_col if s < threshold: col_list.append(col) + logd(f"{excluded_columns}=") + logd(f"{col_list}=") # convert selected columns to Categorical for col in col_list: data[col] = data[col].astype('category') data[col] = data[col].cat.as_ordered() + for col in numerical_columns: + data[col] = data[col].astype('float') + return data diff --git a/plots.py b/plots.py index 9b086e0..2bf6afe 100644 --- a/plots.py +++ b/plots.py @@ -81,7 +81,7 @@ def read_data(self): data_list = list(map(dask.delayed(functools.partial(read_from_file, sample=self.sample, sample_seed=self.sample_seed, filter_query=self.filter_query)) , data_set.get_file_list())) concat_result = dask.delayed(pd.concat)(data_list) - convert_columns_result = dask.delayed(RawExtractor.convert_columns_to_category)(concat_result, excluded_columns=self.numerical_columns) + convert_columns_result = dask.delayed(RawExtractor.convert_columns_to_category)(concat_result, numerical_columns=self.numerical_columns) logd(f'PlottingReaderFeather::read_data: {data_list=}') logd(f'PlottingReaderFeather::read_data: {convert_columns_result=}') # d = dask.compute(convert_columns_result)