Added support for shuffling data sets during parameter limiting process, after each inclusion or after each generation (if using genetic algorithm)
ecrl · Jul 8, 2018 · cdb867a · cdb867a
1 parent df68d18
commit cdb867a
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 13 deletions.
diff --git a/ecnet/limit_parameters.py b/ecnet/limit_parameters.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 #  ecnet/limit_parameters.py
-#  v.1.4.2
+#  v.1.4.3
 #  Developed in 2018 by Travis Kessler <[email protected]>
 #  
 #  This program contains the functions necessary for reducing the input dimensionality of a 
@@ -21,9 +21,10 @@
 
 '''
 Limits the dimensionality of input data found in supplied *DataFrame* object to a
-dimensionality of *limit_num*
+dimensionality of *limit_num* using iterative inclusion; optional argument of 
+*shuffle* for shuffling the data sets after each inclusion
 '''
-def limit_iterative_include(DataFrame, limit_num):
+def limit_iterative_include(DataFrame, limit_num, shuffle = False):
 
 	# List of retained input parameters
 	retained_input_list = []
@@ -33,20 +34,25 @@ def limit_iterative_include(DataFrame, limit_num):
 	valid_input_retained = []
 	test_input_retained = []
 
+	# Obtain Numpy arrays for learning, validation, testing sets
+	packaged_data = DataFrame.package_sets()
+
 	# Until specified number of parameters *limit_num* are retained
 	while len(retained_input_list) < limit_num:
 
 		# List of RMSE's for currently retained inputs + new inputs to test
 		retained_rmse_list = []
 
-		# For all input paramters to test
-		for idx, param in enumerate(DataFrame.input_names):
-
+		# If shuffling the data sets after each inclusion
+		if shuffle:
 			# Shuffle all sets
 			DataFrame.shuffle('l', 'v', 't')
 			# Obtain Numpy arrays for learning, validation, testing sets
 			packaged_data = DataFrame.package_sets()
 
+		# For all input parameters to test
+		for idx, param in enumerate(DataFrame.input_names):
+
 			# Obtain input parameter column for learning, validation and test sets
 			learn_input_add = [[sublist[idx]] for sublist in packaged_data.learn_x]
 			valid_input_add = [[sublist[idx]] for sublist in packaged_data.valid_x]
@@ -130,10 +136,11 @@ def limit_iterative_include(DataFrame, limit_num):
 dimensionality of *limit_num* using a genetic algorithm. Optional arguments for
 *population_size* of genetic algorithm's population, *num_survivors* for selecting
 the best performers from each population generation to reproduce, *num_generations*
-for the number of times the population will reproduce, and *print_feedback* for 
-printing the average fitness score of the population after each generation.
+for the number of times the population will reproduce, *shuffle* for shuffling the
+data sets after each generation, and *print_feedback* for printing the average 
+fitness score of the population after each generation.
 '''
-def limit_genetic(DataFrame, limit_num, population_size, num_survivors, num_generations, print_feedback = True):
+def limit_genetic(DataFrame, limit_num, population_size, num_survivors, num_generations, shuffle = False, print_feedback = True):
 
 	'''
 	Genetic algorithm cost function, supplied to the genetic algorithm; returns the RMSE
@@ -148,7 +155,7 @@ def ecnet_limit_inputs(feed_dict):
 		test_input = []
 
 		# For the input parameters chosen by the genetic algorithm:
-		for idx, param in enumerate(feed_dict):
+		for param in feed_dict:
 
 			# Grab the input parameter
 			learn_input_add = [[sublist[feed_dict[param]]] for sublist in packaged_data.learn_x]
@@ -214,6 +221,15 @@ def minimize_best_n(members, n):
 
 	# Run the genetic algorithm for *num_generations* generations
 	for gen in range(num_generations):
+
+		# If shuffling data sets between generations
+		if shuffle:
+			# Shuffle all sets
+			DataFrame.shuffle('l', 'v', 't')
+			# Obtain Numpy arrays for learning, validation, testing sets
+			packaged_data = DataFrame.package_sets()
+
+		# Next generation
 		population.next_generation(num_survivors = num_survivors, mut_rate = 0)
 		if print_feedback:
 			print('Generation: ' + str(gen + 1) + ' - Population fitness: ' + str(sum(p.fitness_score for p in population.members) / len(population)))

diff --git a/ecnet/server.py b/ecnet/server.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 #  ecnet/server.py
-#  v.1.4.2
+#  v.1.4.3
 #  Developed in 2018 by Travis Kessler <[email protected]>
 #
 #  This file contains the "Server" class, which handles ECNet project creation,
@@ -130,10 +130,18 @@ def import_data(self, data_filename = None):
 		# Package sets for model hand-off
 		self.packaged_data = self.DataFrame.package_sets()
 
-	def limit_parameters(self, limit_num, output_filename, use_genetic = False, population_size = 500, num_survivors = 200, num_generations = 25):
+	'''
+	Limits the input dimensionality of the currently loaded DataFrame to a dimension of *limit_num*.
+	Saves the resulting limited DataFrame to *output_filename*. Option to *shuffle* data sets between
+	inclusions/after each generation if using genetic algorithm. *use_genetic* allows for using a 
+	genetic algorithm to limit the dimensionality (default to iterative inclusion), with arguments 
+	for genetic algorithm *population_size*, *num_survivors* of each generation, and the number of 
+	generations *num_generations* (PyGenetics package).
+	'''
+	def limit_parameters(self, limit_num, output_filename, use_genetic = False, population_size = 500, num_survivors = 200, num_generations = 25, shuffle = False):
 
 		if use_genetic:
-			params = ecnet.limit_parameters.limit_genetic(self.DataFrame, limit_num, population_size, num_survivors, num_generations, print_feedback = self.vars['project_print_feedback'])
+			params = ecnet.limit_parameters.limit_genetic(self.DataFrame, limit_num, population_size, num_survivors, num_generations, shuffle = shuffle, print_feedback = self.vars['project_print_feedback'])
 		else:
 			params = ecnet.limit_parameters.limit_iterative_include(self.DataFrame, limit_num)
 		ecnet.limit_parameters.output(self.DataFrame, params, output_filename)