From 7fcfcee7edbcd1c74350b7edb5a8dfd3dcf7a3d7 Mon Sep 17 00:00:00 2001 From: tjkessler Date: Mon, 6 Jan 2020 16:13:55 -0500 Subject: [PATCH] Version bump: 3.2.3 -> 3.3.0 --- ecnet/models/mlp.py | 8 +++-- ecnet/server.py | 50 +++++++++++++++---------------- ecnet/tasks/limit_inputs.py | 8 ++--- ecnet/tasks/training.py | 4 +-- ecnet/tasks/tuning.py | 12 ++++---- ecnet/tools/database.py | 12 ++++---- ecnet/tools/plotting.py | 17 ++++++----- ecnet/tools/project.py | 8 ++--- ecnet/utils/data_utils.py | 18 +++++------ ecnet/utils/error_utils.py | 4 +-- ecnet/utils/logging.py | 4 +-- ecnet/utils/server_utils.py | 17 ++++++----- ecnet/workflows/ecrl_workflow.py | 36 ++++++++++++++-------- ecnet/workflows/workflow_utils.py | 18 +++++++++-- 14 files changed, 122 insertions(+), 94 deletions(-) diff --git a/ecnet/models/mlp.py b/ecnet/models/mlp.py index 863f897..2755e99 100644 --- a/ecnet/models/mlp.py +++ b/ecnet/models/mlp.py @@ -2,18 +2,19 @@ # -*- coding: utf-8 -*- # # ecnet/models/mlp.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains the "MultilayerPerceptron" (feed-forward neural network) class # +# Stdlib imports from os import environ from re import compile, IGNORECASE +# 3rd party imports from h5py import File from numpy import array, string_, zeros - from tensorflow import config, Tensor from tensorflow.keras.callbacks import EarlyStopping from tensorflow.keras.layers import Dense @@ -21,6 +22,7 @@ from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam +# ECNet imports from ecnet.utils.logging import logger environ['TF_CPP_MIN_LOG_LEVEL'] = '2' diff --git a/ecnet/server.py b/ecnet/server.py index 216c2e0..c72c1b0 100644 --- a/ecnet/server.py +++ b/ecnet/server.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/server.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains the "Server" class, which handles ECNet project creation, neural # network model creation, data hand-off to models, prediction error @@ -26,8 +26,8 @@ class Server: - def __init__(self, model_config: str='config.yml', prj_file: str=None, - num_processes: int=1): + def __init__(self, model_config: str = 'config.yml', prj_file: str = None, + num_processes: int = 1): '''Server object: handles data loading, model creation, data-to-model hand-off, data input parameter selection, hyperparameter tuning @@ -69,8 +69,8 @@ def __init__(self, model_config: str='config.yml', prj_file: str=None, self._vars = default_config() save_config(self._vars, self._cf_file) - def load_data(self, filename: str, random: bool=False, split: list=None, - normalize: bool=False): + def load_data(self, filename: str, random: bool = False, + split: list = None, normalize: bool = False): '''Loads data from an ECNet-formatted CSV database Args: @@ -90,8 +90,8 @@ def load_data(self, filename: str, random: bool=False, split: list=None, self._df.create_sets(random, split) self._sets = self._df.package_sets() - def create_project(self, project_name: str, num_pools: int=1, - num_candidates: int=1): + def create_project(self, project_name: str, num_pools: int = 1, + num_candidates: int = 1): '''Creates folder hierarchy for a new project Args: @@ -111,8 +111,8 @@ def create_project(self, project_name: str, num_pools: int=1, logger.log('debug', 'Number of candidates/pool: {}'.format( num_candidates), call_loc='PROJECT') - def limit_inputs(self, limit_num: int, num_estimators: int=None, - eval_set: str='learn', output_filename: str=None, + def limit_inputs(self, limit_num: int, num_estimators: int = None, + eval_set: str = 'learn', output_filename: str = None, **kwargs) -> list: '''Selects `limit_num` influential input parameters using random forest regression @@ -149,9 +149,9 @@ def limit_inputs(self, limit_num: int, num_estimators: int=None, return result def tune_hyperparameters(self, num_employers: int, num_iterations: int, - shuffle: bool=None, split: list=None, - validate: bool=True, eval_set: str=None, - eval_fn: str='rmse', epochs: int=300): + shuffle: bool = None, split: list = None, + validate: bool = True, eval_set: str = None, + eval_fn: str = 'rmse', epochs: int = 300): '''Tunes neural network learning hyperparameters using an artificial bee colony algorithm; tuned hyperparameters are saved to Server's model configuration file @@ -185,10 +185,10 @@ def tune_hyperparameters(self, num_employers: int, num_iterations: int, ) save_config(self._vars, self._cf_file) - def train(self, shuffle: str=None, split: list=None, retrain: bool=False, - validate: bool=False, selection_set: str=None, - selection_fn: str='rmse', model_filename: str='model.h5', - verbose=0) -> list: + def train(self, shuffle: str = None, split: list = None, + retrain: bool = False, validate: bool = False, + selection_set: str = None, selection_fn: str = 'rmse', + model_filename: str = 'model.h5', verbose: int = 0) -> tuple: '''Trains neural network(s) using currently-loaded data; single NN if no project is created, all candidates if created @@ -210,8 +210,8 @@ def train(self, shuffle: str=None, split: list=None, retrain: bool=False, model only) Returns: - list: if training single model, returns list of learn/valid losses, - else None + tuple: if training single model, returns tuple of learn/valid + losses, else None ''' if self._prj_name is None: @@ -246,8 +246,8 @@ def train(self, shuffle: str=None, split: list=None, retrain: bool=False, ) return None - def use(self, dset: str=None, output_filename: str=None, - model_filename: str='model.h5') -> list: + def use(self, dset: str = None, output_filename: str = None, + model_filename: str = 'model.h5') -> list: '''Uses trained neural network(s) to predict for specified set; single NN if no project created, best pool candidates if created @@ -277,8 +277,8 @@ def use(self, dset: str=None, output_filename: str=None, call_loc='USE') return results - def errors(self, *args, dset: str=None, - model_filename: str='model.h5') -> dict: + def errors(self, *args, dset: str = None, + model_filename: str = 'model.h5') -> dict: '''Obtains various errors for specified set Args: @@ -304,8 +304,8 @@ def errors(self, *args, dset: str=None, logger.log('debug', 'Errors: {}'.format(errors), call_loc='ERRORS') return errors - def save_project(self, filename: str=None, clean_up: bool=True, - del_candidates: bool=False): + def save_project(self, filename: str = None, clean_up: bool = True, + del_candidates: bool = False): '''Saves current state of project to a .prj file Args: diff --git a/ecnet/tasks/limit_inputs.py b/ecnet/tasks/limit_inputs.py index 58fa8ae..48b1d74 100644 --- a/ecnet/tasks/limit_inputs.py +++ b/ecnet/tasks/limit_inputs.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/tasks/limit_inputs.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains functions for selecting influential input parameters # @@ -21,8 +21,8 @@ from ecnet.utils.server_utils import get_x, get_y -def limit_rforest(df: DataFrame, limit_num: int, num_estimators: int=None, - num_processes: int=1, eval_set: str='learn', +def limit_rforest(df: DataFrame, limit_num: int, num_estimators: int = None, + num_processes: int = 1, eval_set: str = 'learn', **kwargs) -> list: '''Uses random forest regression to select input parameters diff --git a/ecnet/tasks/training.py b/ecnet/tasks/training.py index 0f7b702..252b14f 100644 --- a/ecnet/tasks/training.py +++ b/ecnet/tasks/training.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/tasks/training.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains function for project training (multiprocessed training) # diff --git a/ecnet/tasks/tuning.py b/ecnet/tasks/tuning.py index 65e0a08..3792a6d 100644 --- a/ecnet/tasks/tuning.py +++ b/ecnet/tasks/tuning.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/tasks/tuning.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains functions/fitness functions for tuning hyperparameters # @@ -22,10 +22,10 @@ def tune_hyperparameters(df: DataFrame, vars: dict, num_employers: int, - num_iterations: int, num_processes: int=1, - shuffle: str=None, split: list=None, - validate: bool=True, eval_set: str=None, - eval_fn: str='rmse', epochs: int=300) -> dict: + num_iterations: int, num_processes: int = 1, + shuffle: str = None, split: list = None, + validate: bool = True, eval_set: str = None, + eval_fn: str = 'rmse', epochs: int = 300) -> dict: '''Tunes neural network learning/architecture hyperparameters Args: diff --git a/ecnet/tools/database.py b/ecnet/tools/database.py index 5385481..4be7e76 100644 --- a/ecnet/tools/database.py +++ b/ecnet/tools/database.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/tools/database.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains functions for creating ECNet-formatted databases # @@ -20,7 +20,7 @@ try: import pybel -except: +except ImportError: pybel = None @@ -35,9 +35,9 @@ def __init__(self, id): self.inputs = None -def create_db(smiles: list, db_name: str, targets: list=None, - id_prefix: str='', extra_strings: dict={}, backend: str='padel', - convert_mdl: bool=False): +def create_db(smiles: list, db_name: str, targets: list = None, + id_prefix: str = '', extra_strings: dict = {}, + backend: str = 'padel', convert_mdl: bool = False): ''' create_db: creates an ECNet-formatted database from SMILES strings using either PaDEL-Descriptor or alvaDesc software; using alvaDesc requires a valid installation/license of alvaDesc diff --git a/ecnet/tools/plotting.py b/ecnet/tools/plotting.py index 441db81..895aafb 100644 --- a/ecnet/tools/plotting.py +++ b/ecnet/tools/plotting.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/tools/plotting.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains functions/classes for creating various plots # @@ -18,9 +18,10 @@ class ParityPlot: - def __init__(self, title: str='Parity Plot', - x_label: str='Experimental Value', - y_label: str='Predicted Value', font: str='Times New Roman'): + def __init__(self, title: str = 'Parity Plot', + x_label: str = 'Experimental Value', + y_label: str = 'Predicted Value', + font: str = 'Times New Roman'): ''' ParityPlot: creates a plot of predicted values vs. experimental data relative to a 1:1 parity line @@ -39,7 +40,7 @@ def __init__(self, title: str='Parity Plot', self._min_val = 0 self._labels = None - def add_series(self, x_vals, y_vals, name: str=None, color: str=None): + def add_series(self, x_vals, y_vals, name: str = None, color: str = None): ''' Adds data to the plot Args: @@ -67,7 +68,7 @@ def add_series(self, x_vals, y_vals, name: str=None, color: str=None): if y_min < self._min_val: self._min_val = y_min - def add_error_bars(self, error: float, label: str=None): + def add_error_bars(self, error: float, label: str = None): ''' Adds error bars, +/- the error relative to the 1:1 parity line Args: @@ -96,7 +97,7 @@ def save(self, filename: str): self._add_parity_line() plt.savefig(filename) - def _add_parity_line(self, offset: float=0.0): + def _add_parity_line(self, offset: float = 0.0): ''' Adds a 1:1 parity line Args: diff --git a/ecnet/tools/project.py b/ecnet/tools/project.py index b09a7d2..ba00b57 100644 --- a/ecnet/tools/project.py +++ b/ecnet/tools/project.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/tools/project.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains functions for predicting data using pre-existing .prj files # @@ -20,8 +20,8 @@ from ecnet.tools.database import create_db -def predict(smiles: list, prj_file: str, results_file: str=None, - backend: str='padel') -> list: +def predict(smiles: list, prj_file: str, results_file: str = None, + backend: str = 'padel') -> list: ''' predict: predicts values for supplied molecules (SMILES strings) using pre-existing ECNet project (.prj) file diff --git a/ecnet/utils/data_utils.py b/ecnet/utils/data_utils.py index cf4e29d..0a32407 100644 --- a/ecnet/utils/data_utils.py +++ b/ecnet/utils/data_utils.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/utils/data_utils.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains functions/classes for loading data, saving data, saving results # @@ -33,8 +33,8 @@ def __init__(self): class PackagedData: def __init__(self): - '''PackagedData object: contains lists of input and target data for data - set assignments + '''PackagedData object: contains lists of input and target data for + data set assignments ''' self.learn_x = [] @@ -112,7 +112,7 @@ def __len__(self): return len(self.data_points) - def create_sets(self, random: bool=False, split: list=[0.7, 0.2, 0.1]): + def create_sets(self, random: bool = False, split: list = [0.7, 0.2, 0.1]): '''Creates learning, validation and test sets Args: @@ -167,9 +167,9 @@ def create_sets(self, random: bool=False, split: list=[0.7, 0.2, 0.1]): logger.log('debug', 'Number of entries in test set: {}'.format( len(self.test_set)), call_loc='DF') - def create_sorted_sets(self, sort_str: str, split: list=[0.7, 0.2, 0.1]): - '''Creates random learn, validate and test sets, ensuring data points with - the supplied sort string are split proportionally between the sets + def create_sorted_sets(self, sort_str: str, split: list = [0.7, 0.2, 0.1]): + '''Creates random learn, validate and test sets, ensuring data points + with the supplied sort string are split proportionally between the sets Args: sort_str (str): database STRING value used to sort data points @@ -239,7 +239,7 @@ def normalize(self): (float(getattr(pt, inp)) - v_min) / (v_max - v_min) ) - def shuffle(self, sets: str='all', split: list=[0.7, 0.2, 0.1]): + def shuffle(self, sets: str = 'all', split: list = [0.7, 0.2, 0.1]): '''Shuffles learning, validation and test sets or learning and validation sets diff --git a/ecnet/utils/error_utils.py b/ecnet/utils/error_utils.py index 4546c39..16cbf1b 100644 --- a/ecnet/utils/error_utils.py +++ b/ecnet/utils/error_utils.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/utils/error_utils.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains functions for error calculations # diff --git a/ecnet/utils/logging.py b/ecnet/utils/logging.py index f4cc13d..32798bc 100644 --- a/ecnet/utils/logging.py +++ b/ecnet/utils/logging.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/utils/logging.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains logger used by ECNet # diff --git a/ecnet/utils/server_utils.py b/ecnet/utils/server_utils.py index f6d7e82..4d2b1f7 100644 --- a/ecnet/utils/server_utils.py +++ b/ecnet/utils/server_utils.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- # # ecnet/utils/server_utils.py -# v.3.2.3 -# Developed in 2019 by Travis Kessler +# v.3.3.0 +# Developed in 2020 by Travis Kessler # # Contains functions used by ecnet.Server # @@ -80,8 +80,8 @@ def default_config() -> dict: } -def get_candidate_path(prj: str, pool: int, candidate: int=None, - model: bool=False, p_best: bool=False) -> str: +def get_candidate_path(prj: str, pool: int, candidate: int = None, + model: bool = False, p_best: bool = False) -> str: '''Get path to various states of model.h5 files Args: @@ -367,8 +367,9 @@ def save_project(prj_name: str, filename: str, config_filename: str, def train_model(sets: PackagedData, vars: dict, eval_set: str, eval_fn: str, - retrain: bool=False, filename: str='model.h5', - validate: bool=True, save: bool=True, verbose: int=0) -> tuple: + retrain: bool = False, filename: str = 'model.h5', + validate: bool = True, save: bool = True, + verbose: int = 0) -> tuple: '''Trains neural network Args: @@ -385,7 +386,7 @@ def train_model(sets: PackagedData, vars: dict, eval_set: str, eval_fn: str, model only) Returns: - tuple: (error of evaluated set, list of learn/valid losses) + tuple: (error of evaluated set, tuple of learn/valid losses) ''' model = MultilayerPerceptron(filename=filename) @@ -438,7 +439,7 @@ def train_model(sets: PackagedData, vars: dict, eval_set: str, eval_fn: str, def use_model(sets: PackagedData, dset: str, - filename: str='model.h5') -> array: + filename: str = 'model.h5') -> array: '''Uses existing model to predict data Args: diff --git a/ecnet/workflows/ecrl_workflow.py b/ecnet/workflows/ecrl_workflow.py index 5d4221e..c85e0ac 100644 --- a/ecnet/workflows/ecrl_workflow.py +++ b/ecnet/workflows/ecrl_workflow.py @@ -1,25 +1,37 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# ecnet/workflows/ecrl_workflow.py +# v.3.3.0 +# Developed in 2020 by Travis Kessler +# +# General workflow used by the UMass Lowell Energy and Combustion Research +# Laboratory +# + +# Stdlib imports +from datetime import datetime + +# 3rd party imports +from matplotlib import pyplot as plt + +# ECNet imports from ecnet import Server from ecnet.tasks.tuning import tune_hyperparameters from ecnet.tools.database import create_db from ecnet.tools.plotting import ParityPlot from ecnet.utils.data_utils import DataFrame -from ecnet.utils.error_utils import calc_med_abs_error, calc_r2 from ecnet.utils.logging import logger -from ecnet.utils.server_utils import default_config, save_config, train_model,\ - use_model +from ecnet.utils.server_utils import default_config, save_config from ecnet.workflows.workflow_utils import find_optimal_num_inputs,\ prop_range_from_split -from datetime import datetime - -from matplotlib import pyplot as plt - -def create_model(prop_abvr: str, smiles: list=None, targets: list=None, - db_name: str=None, qspr_backend: str='padel', - create_plots: bool=True, data_split: list=[0.7, 0.2, 0.1], - log_level: str='info', log_to_file: bool=True, - num_processes: int=1): +def create_model(prop_abvr: str, smiles: list = None, targets: list = None, + db_name: str = None, qspr_backend: str = 'padel', + create_plots: bool = True, data_split: list = [0.7, 0.2, 0.1], + log_level: str = 'info', log_to_file: bool = True, + num_processes: int = 1): ''' create_model: ECRL's database/model creation workflow for all publications diff --git a/ecnet/workflows/workflow_utils.py b/ecnet/workflows/workflow_utils.py index 8d0c473..6333306 100644 --- a/ecnet/workflows/workflow_utils.py +++ b/ecnet/workflows/workflow_utils.py @@ -1,10 +1,22 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# ecnet/workflows/workflow_utils.py +# v.3.3.0 +# Developed in 2020 by Travis Kessler +# +# Functions used by the ECRL workflow +# + +# Stdlib imports +from multiprocessing import Pool, set_start_method +from os import name + +# ECNet imports from ecnet.utils.data_utils import DataFrame from ecnet.utils.server_utils import default_config, train_model from ecnet.tasks.limit_inputs import limit_rforest -from multiprocessing import Pool, set_start_method -from os import name - def prop_range_from_split(db_name: str, data_split: list): ''' prop_range_from_split: creates learning, validation, test subsets, each