Skip to content

Commit

Permalink
Added support for shuffling data sets during parameter limiting proce…
Browse files Browse the repository at this point in the history
…ss, after each inclusion or after each generation (if using genetic algorithm)
  • Loading branch information
tjkessler committed Jul 8, 2018
1 parent df68d18 commit cdb867a
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 13 deletions.
36 changes: 26 additions & 10 deletions ecnet/limit_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
#
# ecnet/limit_parameters.py
# v.1.4.2
# v.1.4.3
# Developed in 2018 by Travis Kessler <[email protected]>
#
# This program contains the functions necessary for reducing the input dimensionality of a
Expand All @@ -21,9 +21,10 @@

'''
Limits the dimensionality of input data found in supplied *DataFrame* object to a
dimensionality of *limit_num*
dimensionality of *limit_num* using iterative inclusion; optional argument of
*shuffle* for shuffling the data sets after each inclusion
'''
def limit_iterative_include(DataFrame, limit_num):
def limit_iterative_include(DataFrame, limit_num, shuffle = False):

# List of retained input parameters
retained_input_list = []
Expand All @@ -33,20 +34,25 @@ def limit_iterative_include(DataFrame, limit_num):
valid_input_retained = []
test_input_retained = []

# Obtain Numpy arrays for learning, validation, testing sets
packaged_data = DataFrame.package_sets()

# Until specified number of parameters *limit_num* are retained
while len(retained_input_list) < limit_num:

# List of RMSEs for currently retained inputs + new inputs to test
retained_rmse_list = []

# For all input parameters to test
for idx, param in enumerate(DataFrame.input_names):

# If shuffling the data sets after each inclusion
if shuffle:
# Shuffle all sets
DataFrame.shuffle('l', 'v', 't')
# Obtain Numpy arrays for learning, validation, testing sets
packaged_data = DataFrame.package_sets()

# For all input parameters to test
for idx, param in enumerate(DataFrame.input_names):

# Obtain input parameter column for learning, validation and test sets
learn_input_add = [[sublist[idx]] for sublist in packaged_data.learn_x]
valid_input_add = [[sublist[idx]] for sublist in packaged_data.valid_x]
Expand Down Expand Up @@ -130,10 +136,11 @@ def limit_iterative_include(DataFrame, limit_num):
dimensionality of *limit_num* using a genetic algorithm. Optional arguments for
*population_size* of genetic algorithm's population, *num_survivors* for selecting
the best performers from each population generation to reproduce, *num_generations*
for the number of times the population will reproduce, and *print_feedback* for
printing the average fitness score of the population after each generation.
for the number of times the population will reproduce, *shuffle* for shuffling the
data sets after each generation, and *print_feedback* for printing the average
fitness score of the population after each generation.
'''
def limit_genetic(DataFrame, limit_num, population_size, num_survivors, num_generations, print_feedback = True):
def limit_genetic(DataFrame, limit_num, population_size, num_survivors, num_generations, shuffle = False, print_feedback = True):

'''
Genetic algorithm cost function, supplied to the genetic algorithm; returns the RMSE
Expand All @@ -148,7 +155,7 @@ def ecnet_limit_inputs(feed_dict):
test_input = []

# For the input parameters chosen by the genetic algorithm:
for idx, param in enumerate(feed_dict):
for param in feed_dict:

# Grab the input parameter
learn_input_add = [[sublist[feed_dict[param]]] for sublist in packaged_data.learn_x]
Expand Down Expand Up @@ -214,6 +221,15 @@ def minimize_best_n(members, n):

# Run the genetic algorithm for *num_generations* generations
for gen in range(num_generations):

# If shuffling data sets between generations
if shuffle:
# Shuffle all sets
DataFrame.shuffle('l', 'v', 't')
# Obtain Numpy arrays for learning, validation, testing sets
packaged_data = DataFrame.package_sets()

# Next generation
population.next_generation(num_survivors = num_survivors, mut_rate = 0)
if print_feedback:
print('Generation: ' + str(gen + 1) + ' - Population fitness: ' + str(sum(p.fitness_score for p in population.members) / len(population)))
Expand Down
14 changes: 11 additions & 3 deletions ecnet/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
#
# ecnet/server.py
# v.1.4.2
# v.1.4.3
# Developed in 2018 by Travis Kessler <[email protected]>
#
# This file contains the "Server" class, which handles ECNet project creation,
Expand Down Expand Up @@ -130,10 +130,18 @@ def import_data(self, data_filename = None):
# Package sets for model hand-off
self.packaged_data = self.DataFrame.package_sets()

def limit_parameters(self, limit_num, output_filename, use_genetic = False, population_size = 500, num_survivors = 200, num_generations = 25):
'''
Limits the input dimensionality of the currently loaded DataFrame to a dimension of *limit_num*.
Saves the resulting limited DataFrame to *output_filename*. Option to *shuffle* data sets between
inclusions/after each generation if using genetic algorithm. *use_genetic* allows for using a
genetic algorithm to limit the dimensionality (default to iterative inclusion), with arguments
for genetic algorithm *population_size*, *num_survivors* of each generation, and the number of
generations *num_generations* (PyGenetics package).
'''
def limit_parameters(self, limit_num, output_filename, use_genetic = False, population_size = 500, num_survivors = 200, num_generations = 25, shuffle = False):

if use_genetic:
params = ecnet.limit_parameters.limit_genetic(self.DataFrame, limit_num, population_size, num_survivors, num_generations, print_feedback = self.vars['project_print_feedback'])
params = ecnet.limit_parameters.limit_genetic(self.DataFrame, limit_num, population_size, num_survivors, num_generations, shuffle = shuffle, print_feedback = self.vars['project_print_feedback'])
else:
params = ecnet.limit_parameters.limit_iterative_include(self.DataFrame, limit_num)
ecnet.limit_parameters.output(self.DataFrame, params, output_filename)
Expand Down

0 comments on commit cdb867a

Please sign in to comment.