diff --git a/ecnet/data_utils.py b/ecnet/data_utils.py index f6cda41..812391a 100644 --- a/ecnet/data_utils.py +++ b/ecnet/data_utils.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # ecnet/data_utils.py -# v.1.5 +# v.1.5.1 # Developed in 2018 by Travis Kessler # # Contains the "DataFrame" class, and functions for processing/importing/ @@ -190,7 +190,7 @@ def shuffle(self, *args, split): ''' if 'l' and 'v' and 't' in args: - self.create_sets(split=split) + self.create_sets(random=True, split=split) elif 'l' and 'v' in args: lv_set = [] diff --git a/ecnet/limit_parameters.py b/ecnet/limit_parameters.py index 5144ed7..39e03e0 100644 --- a/ecnet/limit_parameters.py +++ b/ecnet/limit_parameters.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # ecnet/limit_parameters.py -# v.1.5 +# v.1.5.1 # Developed in 2018 by Travis Kessler # # Contains the functions necessary for reducing the input dimensionality of a @@ -20,7 +20,7 @@ import ecnet.error_utils -def limit_iterative_include(DataFrame, limit_num): +def limit_iterative_include(DataFrame, limit_num, print_feedback=True): ''' Limits the dimensionality of input data found in supplied *DataFrame* object to a dimensionality of *limit_num* using iterative inclusion @@ -114,15 +114,17 @@ def limit_iterative_include(DataFrame, limit_num): test_input_retained[idx].append(param[0]) retained_input_list.append(DataFrame.input_names[rmse_idx]) - print(retained_input_list) - print(rmse_val) - print() + if print_feedback: + print(retained_input_list) + print(rmse_val) + print() return retained_input_list def limit_genetic(DataFrame, limit_num, population_size, num_survivors, - num_generations, shuffle=False, print_feedback=True): + num_generations, shuffle=False, data_split=[0.65, 0.25, 0.1], + print_feedback=True): ''' Limits the dimensionality of input data found in supplied *DataFrame* object to a dimensionality of *limit_num* using a genetic algorithm. @@ -130,8 +132,9 @@ def limit_genetic(DataFrame, limit_num, population_size, num_survivors, *num_survivors* for selecting the best performers from each population generation to reproduce, *num_generations* for the number of times the population will reproduce, *shuffle* for shuffling the data sets for each - population member, and *print_feedback* for printing the average fitness - score of the population after each generation. + population member, *data_split* to determine l/v/t splits if shuffling, + and *print_feedback* for printing the average fitness score of the + population after each generation. ''' def ecnet_limit_inputs(feed_dict): @@ -147,7 +150,7 @@ def ecnet_limit_inputs(feed_dict): test_input = [] if shuffle: - DataFrame.shuffle('l', 'v', 't') + DataFrame.shuffle('l', 'v', 't', split=data_split) packaged_data_cf = DataFrame.package_sets() else: packaged_data_cf = packaged_data diff --git a/ecnet/server.py b/ecnet/server.py index 0231890..ebfdc5f 100644 --- a/ecnet/server.py +++ b/ecnet/server.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # ecnet/server.py -# v.1.5 +# v.1.5.1 # Developed in 2018 by Travis Kessler # # Contains the "Server" class, which handles ECNet project creation, neural @@ -128,7 +128,7 @@ def import_data(self, data_filename, sort_type='random', def limit_input_parameters(self, limit_num, output_filename, use_genetic=False, population_size=500, num_survivors=200, num_generations=25, - shuffle=False): + shuffle=False, data_split=[0.65, 0.25, 0.1]): ''' Limits the input dimensionality of currently loaded DataFrame; default method is an iterative inclusion algorithm, options for using a genetic @@ -144,20 +144,36 @@ def limit_input_parameters(self, limit_num, output_filename, *num_generations* - number of generations the algorithm will run for *shuffle* - whether to shuffle learning, validation and testing sets for each population member + *data_split* - if shuffling, learning/validation/testing data + is split using this argument See https://github.com/tjkessler/pygenetics for genetic algorithm source code. ''' if use_genetic: - params = ecnet.limit_parameters.limit_genetic( - self.DataFrame, limit_num, population_size, num_survivors, - shuffle=shuffle, print_feedback=self.__print_feedback - ) + try: + params = ecnet.limit_parameters.limit_genetic( + self.DataFrame, limit_num, population_size, + num_survivors, num_generations, shuffle=shuffle, + print_feedback=self.__print_feedback + ) + except: + params = ecnet.limit_parameters.limit_genetic( + self.DataFrame, limit_num, population_size, num_survivors, + num_generations, shuffle=shuffle, data_split=data_split, + print_feedback=True + ) else: - params = ecnet.limit_parameters.limit_iterative_include( - self.DataFrame, limit_num - ) + try: + params = ecnet.limit_parameters.limit_iterative_include( + self.DataFrame, limit_num, + print_feedback=self.__print_feedback + ) + except: + params = ecnet.limit_parameters.limit_iterative_include( + self.DataFrame, limit_num, print_feedback=True + ) ecnet.limit_parameters.output(self.DataFrame, params, output_filename) def tune_hyperparameters(self, target_score=None, iteration_amt=50, diff --git a/examples/limit_input_parameters.py b/examples/limit_input_parameters.py index 7004ecd..87e6ede 100644 --- a/examples/limit_input_parameters.py +++ b/examples/limit_input_parameters.py @@ -8,9 +8,9 @@ sv.import_data('my_data.csv') # Limit the input dimensionality to 15, save to 'my_data_limited.csv' -sv.limit_parameters(15, 'my_data_limited.csv') +sv.limit_input_parameters(15, 'my_data_limited.csv') # Use this line instead for limiting the input dimensionality using a genetic # algorithm -sv.limit_parameters(15, 'my_data_limited_genetic.csv', use_genetic=True) +sv.limit_input_parameters(15, 'my_data_limited_genetic.csv', use_genetic=True)