diff --git a/tpot2/config/autoqtl_builtins.py b/tpot2/config/autoqtl_builtins.py index d4638894..d3cc8dfc 100644 --- a/tpot2/config/autoqtl_builtins.py +++ b/tpot2/config/autoqtl_builtins.py @@ -1,6 +1,7 @@ from tpot2.builtin_modules import genetic_encoders from tpot2.builtin_modules import feature_encoding_frequency_selector import sklearn +import numpy as np def params_FeatureEncodingFrequencySelector(trial, name=None): return { @@ -21,4 +22,3 @@ def make_genetic_encoders_config_dictionary(): genetic_encoders.UnderDominanceEncoder : {}, genetic_encoders.OverDominanceEncoder : {}, } - diff --git a/tpot2/config/classifiers.py b/tpot2/config/classifiers.py index d11dc396..06ed2507 100644 --- a/tpot2/config/classifiers.py +++ b/tpot2/config/classifiers.py @@ -14,9 +14,11 @@ from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB +import numpy as np -def params_LogisticRegression(trial, name=None): + +def params_LogisticRegression(trial, random_state=None, name=None): params = {} params['solver'] = trial.suggest_categorical(name=f'solver_{name}', choices=[f'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']) @@ -40,22 +42,22 @@ def params_LogisticRegression(trial, name=None): 'C': params['C'], 'n_jobs': 1, 'max_iter': 1000, + 'random_state': random_state } return param_grid def params_KNeighborsClassifier(trial, name=None, n_samples=10): return { - #'n_neighbors': trial.suggest_int(f'n_neighbors_{name}', 1, 20 ), #TODO: set as a function of the number of samples - 'n_neighbors': trial.suggest_int(f'n_neighbors_{name}', 1, n_samples, log=True ), #TODO: set as a function of the number of samples + 'n_neighbors': trial.suggest_int(f'n_neighbors_{name}', 1, n_samples, log=True ), 'weights': trial.suggest_categorical(f'weights_{name}', ['uniform', 'distance']), 'p': trial.suggest_int('p', 1, 3), - 'metric': trial.suggest_categorical(f'metric_{name}', ['euclidean', 'minkowski']), + 'metric': str(trial.suggest_categorical(f'metric_{name}', ['euclidean', 'minkowski'])), 'n_jobs': 1, } -def params_DecisionTreeClassifier(trial, name=None): +def params_DecisionTreeClassifier(trial, random_state=None, name=None): return { 'criterion': trial.suggest_categorical(f'criterion_{name}', ['gini', 'entropy']), 'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 11), @@ -65,10 +67,11 @@ def params_DecisionTreeClassifier(trial, name=None): 'min_weight_fraction_leaf': 0.0, 'max_features': trial.suggest_categorical(f'max_features_{name}', [ 'sqrt', 'log2']), 'max_leaf_nodes': None, + 'random_state': random_state } -def params_SVC(trial, name=None): +def params_SVC(trial, random_state=None, name=None): return { 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), @@ -79,16 +82,18 @@ def params_SVC(trial, name=None): 'max_iter': 3000, 'tol': 0.005, 'probability': True, + 'random_state': random_state } -def params_LinearSVC(trial, name=None): +def params_LinearSVC(trial, random_state=None, name=None): + penalty = trial.suggest_categorical(name=f'penalty_{name}', choices=['l1', 'l2']) if penalty == 'l1': loss = 'squared_hinge' else: loss = trial.suggest_categorical(name=f'loss_{name}', choices=['hinge', 'squared_hinge']) - + if loss == 'hinge' and penalty == 'l2': dual = True else: @@ -97,12 +102,13 @@ def params_LinearSVC(trial, name=None): return { 'penalty': penalty, 'loss': loss, - 'dual': dual, + 'dual': dual, 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), + 'random_state': 
random_state } -def params_RandomForestClassifier(trial, name=None): +def params_RandomForestClassifier(trial, random_state=None, name=None): params = { 'n_estimators': 100, 'criterion': trial.suggest_categorical(name=f'criterion_{name}', choices=['gini', 'entropy']), @@ -111,16 +117,18 @@ def params_RandomForestClassifier(trial, name=None): 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 20), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 20), 'n_jobs': 1, + 'random_state': random_state } return params -def params_GradientBoostingClassifier(trial,n_classes=None, name=None): +def params_GradientBoostingClassifier(trial, random_state=None, n_classes=None, name=None): + if n_classes is not None and n_classes > 2: loss = 'log_loss' else: loss = trial.suggest_categorical(name=f'loss_{name}', choices=['log_loss', 'exponential']) - + params = { 'n_estimators': 100, 'loss': loss, @@ -131,11 +139,12 @@ def params_GradientBoostingClassifier(trial,n_classes=None, name=None): 'max_features': trial.suggest_float(f'max_features_{name}', 0.1, 1.0), 'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 10), 'tol': 1e-4, + 'random_state': random_state } return params -def params_XGBClassifier(trial, name=None): +def params_XGBClassifier(trial, random_state=None, name=None): return { 'learning_rate': trial.suggest_float(f'learning_rate_{name}', 1e-3, 1, log=True), 'subsample': trial.suggest_float(f'subsample_{name}', 0.1, 1.0), @@ -145,10 +154,11 @@ def params_XGBClassifier(trial, name=None): 'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 11), 'n_jobs': 1, #'use_label_encoder' : True, + 'random_state': random_state } -def params_LGBMClassifier(trial, name=None): +def params_LGBMClassifier(trial, random_state=None, name=None): params = { 'objective': 'binary', 'metric': 'binary_logloss', @@ -159,13 +169,15 @@ def params_LGBMClassifier(trial, name=None): 'deterministic': True, 'force_row_wise': True, 'n_jobs': 1, + 'random_state': random_state + } if 2 ** params['max_depth'] > params['num_leaves']: params['num_leaves'] = 2 ** params['max_depth'] return params -def params_ExtraTreesClassifier(trial, name=None): +def params_ExtraTreesClassifier(trial, random_state=None, name=None): params = { 'n_estimators': 100, 'criterion': trial.suggest_categorical(name=f'criterion_{name}', choices=["gini", "entropy"]), @@ -174,10 +186,11 @@ def params_ExtraTreesClassifier(trial, name=None): 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21, step=1), 'bootstrap': trial.suggest_categorical(f'bootstrap_{name}', [True, False]), 'n_jobs': 1, + 'random_state': random_state } return params -def params_SGDClassifier(trial, name=None): +def params_SGDClassifier(trial, random_state=None, name=None): params = { 'loss': trial.suggest_categorical(f'loss_{name}', ['log_loss', 'modified_huber',]), 'penalty': 'elasticnet', @@ -188,20 +201,21 @@ def params_SGDClassifier(trial, name=None): 'eta0': trial.suggest_float(f'eta0_{name}', 0.01, 1.0), 'power_t': trial.suggest_float(f'power_t_{name}', 1e-5, 100.0, log=True), 'n_jobs': 1, + 'random_state': random_state } return params -def params_MLPClassifier_tpot(trial, name=None): +def params_MLPClassifier_tpot(trial, random_state=None, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 1e-4, 1e-1, log=True), - 'learning_rate_init': trial.suggest_float(f'learning_rate_init_{name}', 1e-3, 1., log=True) + 'learning_rate_init': trial.suggest_float(f'learning_rate_init_{name}', 1e-3, 1., log=True), + 
'random_state': random_state } return params def params_MLPClassifier_large(trial, name=None): - n_layers = trial.suggest_int(f'n_layers_{name}', 2, 3) layers = [] for i in range(n_layers): @@ -215,7 +229,8 @@ def params_MLPClassifier_large(trial, name=None): 'max_iter' : 10000 } - return params + return params + def params_BernoulliNB(trial, name=None): params = { @@ -233,24 +248,23 @@ def params_MultinomialNB(trial, name=None): return params -def make_classifier_config_dictionary(n_samples=10, n_classes=None): +def make_classifier_config_dictionary(random_state=None, n_samples=10, n_classes=None): n_samples = min(n_samples,100) #TODO optimize this return { - LogisticRegression: params_LogisticRegression, - DecisionTreeClassifier: params_DecisionTreeClassifier, + LogisticRegression: partial(params_LogisticRegression, random_state=random_state), + DecisionTreeClassifier: partial(params_DecisionTreeClassifier, random_state=random_state), KNeighborsClassifier: partial(params_KNeighborsClassifier,n_samples=n_samples), - GradientBoostingClassifier: partial(params_GradientBoostingClassifier, n_classes=n_classes), - ExtraTreesClassifier:params_ExtraTreesClassifier, - RandomForestClassifier: params_RandomForestClassifier, - SGDClassifier:params_SGDClassifier, + GradientBoostingClassifier: partial(params_GradientBoostingClassifier, random_state=random_state, n_classes=n_classes), + ExtraTreesClassifier: partial(params_ExtraTreesClassifier, random_state=random_state), + RandomForestClassifier: partial(params_RandomForestClassifier, random_state=random_state), + SGDClassifier: partial(params_SGDClassifier, random_state=random_state), GaussianNB: {}, BernoulliNB: params_BernoulliNB, MultinomialNB: params_MultinomialNB, - XGBClassifier: params_XGBClassifier, - #LinearSVC: params_LinearSVC, - SVC: params_SVC, + XGBClassifier: partial(params_XGBClassifier, random_state=random_state), + #LinearSVC: partial(params_LinearSVC, random_state=random_state), + SVC: partial(params_SVC, random_state=random_state), #: params_LGBMClassifier, # logistic regression and SVM/SVC are just special cases of this one? remove? 
- MLPClassifier: params_MLPClassifier_tpot, + MLPClassifier: partial(params_MLPClassifier_tpot, random_state=random_state), } - diff --git a/tpot2/config/classifiers_sklearnex.py b/tpot2/config/classifiers_sklearnex.py index 7d4129d0..16983332 100644 --- a/tpot2/config/classifiers_sklearnex.py +++ b/tpot2/config/classifiers_sklearnex.py @@ -4,14 +4,18 @@ from sklearnex.svm import NuSVC from sklearnex.linear_model import LogisticRegression +import numpy as np -def params_RandomForestClassifier(trial, name=None): +from functools import partial + +def params_RandomForestClassifier(trial, random_state=None, name=None): return { 'n_estimators': 100, 'bootstrap': trial.suggest_categorical(name=f'bootstrap_{name}', choices=[True, False]), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 20), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 20), 'n_jobs': 1, + 'random_state': random_state } def params_KNeighborsClassifier(trial, name=None, n_samples=10): @@ -21,7 +25,7 @@ def params_KNeighborsClassifier(trial, name=None, n_samples=10): 'weights': trial.suggest_categorical(f'weights_{name}', ['uniform', 'distance']), } -def params_LogisticRegression(trial, name=None): +def params_LogisticRegression(trial, random_state=None, name=None): params = {} params['dual'] = False params['penalty'] = 'l2' @@ -38,9 +42,10 @@ def params_LogisticRegression(trial, name=None): 'dual': params['dual'], 'C': trial.suggest_float(f'C_{name}', 1e-4, 1e4, log=True), 'max_iter': 1000, + 'random_state': random_state } -def params_SVC(trial, name=None): +def params_SVC(trial, random_state=None, name=None): return { 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), @@ -49,9 +54,10 @@ def params_SVC(trial, name=None): 'max_iter': 3000, 'tol': 0.005, 'probability': True, + 'random_state': random_state } -def params_NuSVC(trial, name=None): +def params_NuSVC(trial, random_state=None, name=None): return { 'nu': trial.suggest_float(f'subsample_{name}', 0.05, 1.0), 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), @@ -61,13 +67,14 @@ def params_NuSVC(trial, name=None): 'max_iter': 3000, 'tol': 0.005, 'probability': True, + 'random_state': random_state } -def make_sklearnex_classifier_config_dictionary(n_samples=10, n_classes=None): +def make_sklearnex_classifier_config_dictionary(random_state=None, n_samples=10, n_classes=None): return { - RandomForestClassifier: params_RandomForestClassifier, - KNeighborsClassifier: params_KNeighborsClassifier, - LogisticRegression: params_LogisticRegression, - SVC: params_SVC, - NuSVC: params_NuSVC, - } + RandomForestClassifier: partial(params_RandomForestClassifier, random_state=random_state), + KNeighborsClassifier: partial(params_KNeighborsClassifier, n_samples=n_samples), + LogisticRegression: partial(params_LogisticRegression, random_state=random_state), + SVC: partial(params_SVC, random_state=random_state), + NuSVC: partial(params_NuSVC, random_state=random_state), + } \ No newline at end of file diff --git a/tpot2/config/hyperparametersuggestor.py b/tpot2/config/hyperparametersuggestor.py index 73c9c678..1d3ad1f0 100644 --- a/tpot2/config/hyperparametersuggestor.py +++ b/tpot2/config/hyperparametersuggestor.py @@ -1,19 +1,23 @@ -import random -from scipy.stats import loguniform, logser #TODO: remove this dependency? 
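Across classifiers.py and classifiers_sklearnex.py, each config entry now maps an estimator class either to a fixed parameter dict or to a parameter function with random_state (and, where relevant, n_samples or n_classes) pre-bound via functools.partial. A minimal sketch of how such an entry could be resolved for one estimator; the DummyTrial stand-in, the 'ExampleEstimator' key, and the resolution step are illustrative assumptions, not TPOT2's actual evaluation code.

from functools import partial

def params_example(trial, random_state=None, name=None):
    # stand-in for a real params_* function such as params_RandomForestClassifier
    return {
        'n_estimators': 100,
        'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 20),
        'random_state': random_state,
    }

class DummyTrial:
    # stand-in for the Trial class defined in hyperparametersuggestor.py
    def suggest_int(self, name, low, high, **kwargs):
        return low

config = {'ExampleEstimator': partial(params_example, random_state=42)}

entry = config['ExampleEstimator']
hyperparams = entry(DummyTrial(), name='example_0') if callable(entry) else dict(entry)
print(hyperparams)  # {'n_estimators': 100, 'min_samples_leaf': 1, 'random_state': 42}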
-import numpy as np #TODO: remove this dependency and use scipy instead? +# import random +# from scipy.stats import loguniform, logser #TODO: remove this dependency? +import numpy as np #function that selects selects items from a list with each having independent probability p of being selected -def select(items, p): - selected = [item for item in items if random.random() < p] +def select(items, p, rng_=None): + rng = np.random.default_rng(rng_) + + selected = [item for item in items if rng.random() < p] #if selected is empty, select one item at random if not selected: - return [random.choice(items)] + return [rng.choice(items)] return selected class Trial(): - def __init__(self, old_params=None, alpha=1, hyperparameter_probability=1): + def __init__(self, rng_=None, old_params=None, alpha=1, hyperparameter_probability=1): + self.rng = np.random.default_rng(rng_) + self._params = dict() self.old_params = old_params @@ -21,7 +25,7 @@ def __init__(self, old_params=None, alpha=1, hyperparameter_probability=1): self.hyperparameter_probability = hyperparameter_probability if old_params is not None and len(old_params) > 0: - self.params_to_update = select(list(old_params.keys()), self.hyperparameter_probability) + self.params_to_update = select(list(old_params.keys()), self.hyperparameter_probability, rng_=self.rng) else: self.params_to_update = None @@ -35,11 +39,11 @@ def suggest_categorical(self, name, choices): choice = self.old_params[name] if choice not in choices: #if the old value is not in the choices, then we need to choose a value for it choice = self.suggest_categorical_(name, choices) - + self._params[name] = choice return choice - def suggest_float(self, + def suggest_float(self, name: str, low: float, high: float, @@ -94,18 +98,18 @@ def suggest_uniform(self, name, low, high): self._params[name] = choice return choice - + #################################### #Replicating the API found in optuna: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html #copy-pasted some code def suggest_categorical_(self, name, choices): - - choice = random.choice(choices) + + choice = self.rng.choice(choices) return choice - def suggest_float_(self, + def suggest_float_(self, name: str, low: float, high: float, @@ -113,7 +117,7 @@ def suggest_float_(self, step = None, log = False, ): - + if log and step is not None: raise ValueError("The parameter `step` is not supported when `log` is true.") @@ -136,16 +140,16 @@ def suggest_float_(self, #TODO check this produces correct output if log: - value = np.random.uniform(np.log(low),np.log(high)) + value = self.rng.uniform(np.log(low),np.log(high)) choice = np.e**value return choice else: if step is not None: - choice = np.random.choice(np.arange(low,high,step)) + choice = self.rng.choice(np.arange(low,high,step)) return choice else: - choice = np.random.uniform(low,high) + choice = self.rng.uniform(low,high) return choice @@ -157,7 +161,7 @@ def suggest_discrete_uniform_(self, name, low, high, q): def suggest_int_(self, name, low, high, step=1, log=False): if low == high: #TODO check that this matches optuna's behaviour return low - + if log and step >1: raise ValueError("The parameter `step`>1 is not supported when `log` is true.") @@ -179,11 +183,11 @@ def suggest_int_(self, name, low, high, step=1, log=False): ) if log: - value = np.random.uniform(np.log(low),np.log(high)) + value = self.rng.uniform(np.log(low),np.log(high)) choice = int(np.e**value) return choice else: - choice = 
np.random.choice(list(range(low,high,step))) + choice = self.rng.choice(list(range(low,high,step))) return choice def suggest_uniform_(self, name, low, high): diff --git a/tpot2/config/mdr_configs.py b/tpot2/config/mdr_configs.py index aff4ee87..1fe7cc7a 100644 --- a/tpot2/config/mdr_configs.py +++ b/tpot2/config/mdr_configs.py @@ -57,5 +57,4 @@ def make_MDR_config_dictionary(): def make_ContinuousMDR_config_dictionary(): return { ContinuousMDR : params_ContinuousMDR - } - + } \ No newline at end of file diff --git a/tpot2/config/regressors.py b/tpot2/config/regressors.py index 930e0e7e..ad7aa182 100644 --- a/tpot2/config/regressors.py +++ b/tpot2/config/regressors.py @@ -18,8 +18,6 @@ from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import ElasticNetCV - - from xgboost import XGBRegressor from functools import partial @@ -29,18 +27,19 @@ #TODO: fill in remaining #TODO check for places were we could use log scaling -def params_RandomForestRegressor(trial, name=None): +def params_RandomForestRegressor(trial, random_state=None, name=None): return { 'n_estimators': 100, 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.0), 'bootstrap': trial.suggest_categorical(name=f'bootstrap_{name}', choices=[True, False]), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 21), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21), + 'random_state': random_state } # SGDRegressor parameters -def params_SGDRegressor(trial, name=None): +def params_SGDRegressor(trial, random_state=None, name=None): params = { 'loss': trial.suggest_categorical(f'loss_{name}', ['huber', 'squared_error', 'epsilon_insensitive', 'squared_epsilon_insensitive']), 'penalty': 'elasticnet', @@ -49,13 +48,14 @@ def params_SGDRegressor(trial, name=None): 'fit_intercept':True, 'l1_ratio': trial.suggest_float(f'l1_ratio_{name}', 0.0, 1.0), 'eta0': trial.suggest_float(f'eta0_{name}', 0.01, 1.0), - 'power_t': trial.suggest_float(f'power_t_{name}', 1e-5, 100.0, log=True) + 'power_t': trial.suggest_float(f'power_t_{name}', 1e-5, 100.0, log=True), + 'random_state': random_state } return params # Ridge parameters -def params_Ridge(trial, name=None): +def params_Ridge(trial, random_state=None, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, @@ -64,12 +64,13 @@ def params_Ridge(trial, name=None): #'max_iter': trial.suggest_int(f'max_iter_{name}', 100, 1000), 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), 'solver': trial.suggest_categorical(f'solver_{name}', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']), + 'random_state': random_state } return params # Lasso parameters -def params_Lasso(trial, name=None): +def params_Lasso(trial, random_state=None, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, @@ -81,30 +82,33 @@ def params_Lasso(trial, name=None): 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), 'selection': trial.suggest_categorical(f'selection_{name}', ['cyclic', 'random']), + 'random_state': random_state } return params # ElasticNet parameters -def params_ElasticNet(trial, name=None): +def params_ElasticNet(trial, random_state=None, name=None): params = { 'alpha': 1 - trial.suggest_float(f'alpha_{name}', 0.0, 1.0, log=True), 'l1_ratio': 1- trial.suggest_float(f'l1_ratio_{name}',0.0, 1.0), + 'random_state': random_state } return params # Lars parameters -def params_Lars(trial, name=None): 
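The hyperparametersuggestor.py changes above swap the module-level random/np.random calls for a per-Trial numpy Generator, so every suggestion becomes a function of the rng_ that was passed in. A small sketch, with an arbitrary seed and bounds, of the log=True branch of suggest_float_ showing why two trials seeded identically produce identical draws.

import numpy as np

def log_uniform_draw(rng, low, high):
    # mirrors the log=True branch above: sample uniformly in log-space, then exponentiate
    value = rng.uniform(np.log(low), np.log(high))
    return np.e ** value

a = log_uniform_draw(np.random.default_rng(0), 1e-4, 25)
b = log_uniform_draw(np.random.default_rng(0), 1e-4, 25)
assert a == b  # same seed, same suggestion
print(a)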
+def params_Lars(trial, random_state=None, name=None): params = { 'fit_intercept': True, 'verbose': trial.suggest_categorical(f'verbose_{name}', [True, False]), 'normalize': trial.suggest_categorical(f'normalize_{name}', [True, False]), - + # 'precompute': trial.suggest_categorical(f'precompute_{name}', ['auto_{name}', True, False]), 'n_nonzero_coefs': trial.suggest_int(f'n_nonzero_coefs_{name}', 1, 100), 'eps': trial.suggest_float(f'eps_{name}', 1e-5, 1e-1, log=True), 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), 'fit_path': trial.suggest_categorical(f'fit_path_{name}', [True, False]), # 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), + 'random_state': random_state } return params @@ -136,7 +140,7 @@ def params_BayesianRidge(trial, name=None): return params # LassoLars parameters -def params_LassoLars(trial, name=None): +def params_LassoLars(trial, random_state=None, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), # 'fit_intercept': True, @@ -146,24 +150,27 @@ def params_LassoLars(trial, name=None): 'eps': trial.suggest_float(f'eps_{name}', 1e-5, 1e-1, log=True), # 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), # 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), + 'random_state': random_state } return params # LassoLars parameters -def params_LassoLarsCV(trial, name=None): +def params_LassoLarsCV(trial, cv, name=None): params = { 'normalize': trial.suggest_categorical(f'normalize_{name}', [True, False]), + 'cv': cv, } return params # BaggingRegressor parameters -def params_BaggingRegressor(trial, name=None): +def params_BaggingRegressor(trial, random_state=None, name=None): params = { 'n_estimators': trial.suggest_int(f'n_estimators_{name}', 10, 100), 'max_samples': trial.suggest_float(f'max_samples_{name}', 0.05, 1.00), 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.00), 'bootstrap': trial.suggest_categorical(f'bootstrap_{name}', [True, False]), 'bootstrap_features': trial.suggest_categorical(f'bootstrap_features_{name}', [True, False]), + 'random_state': random_state } return params @@ -187,13 +194,14 @@ def params_ARDRegression(trial, name=None): # TheilSenRegressor parameters -def params_TheilSenRegressor(trial, name=None): +def params_TheilSenRegressor(trial, random_state=None, name=None): params = { 'n_subsamples': trial.suggest_int(f'n_subsamples_{name}', 10, 100), 'max_subpopulation': trial.suggest_int(f'max_subpopulation_{name}', 100, 1000), 'fit_intercept': True, 'copy_X': trial.suggest_categorical(f'copy_X_{name}', [True, False]), 'verbose': trial.suggest_categorical(f'verbose_{name}', [True, False]), + 'random_state': random_state } return params @@ -208,9 +216,9 @@ def params_SVR(trial, name=None): 'tol': 0.005, } return params - + # Perceptron parameters -def params_Perceptron(trial, name=None): +def params_Perceptron(trial, random_state=None, name=None): params = { 'penalty': trial.suggest_categorical(f'penalty_{name}', [None, 'l2', 'l1', 'elasticnet']), 'alpha': trial.suggest_float(f'alpha_{name}', 1e-5, 1e-1, log=True), @@ -228,20 +236,22 @@ def params_Perceptron(trial, name=None): 'class_weight': trial.suggest_categorical(f'class_weight_{name}', [None, 'balanced']), 'warm_start': trial.suggest_categorical(f'warm_start_{name}', [True, False]), 'average': trial.suggest_categorical(f'average_{name}', [True, False]), + 'random_state': random_state } return params -def params_MLPRegressor(trial, name=None): +def 
params_MLPRegressor(trial, random_state=None, name=None): params = { 'alpha': trial.suggest_float(f'alpha_{name}', 1e-4, 1e-1, log=True), - 'learning_rate_init': trial.suggest_float(f'learning_rate_init_{name}', 1e-3, 1., log=True) + 'learning_rate_init': trial.suggest_float(f'learning_rate_init_{name}', 1e-3, 1., log=True), + 'random_state': random_state } return params #GradientBoostingRegressor parameters -def params_GradientBoostingRegressor(trial, name=None): +def params_GradientBoostingRegressor(trial, random_state=None, name=None): loss = trial.suggest_categorical(f'loss_{name}', ['ls', 'lad', 'huber', 'quantile']) params = { @@ -254,6 +264,7 @@ def params_GradientBoostingRegressor(trial, name=None): 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21), 'subsample': 1-trial.suggest_float(f'subsample_{name}', 0.05, 1.00, log=True), 'max_features': 1-trial.suggest_float(f'max_features_{name}', 0.05, 1.00, log=True), + 'random_state': random_state } @@ -265,7 +276,7 @@ def params_GradientBoostingRegressor(trial, name=None): -def params_DecisionTreeRegressor(trial, name=None): +def params_DecisionTreeRegressor(trial, random_state=None, name=None): params = { 'max_depth': trial.suggest_int(f'max_depth_{name}', 1,11), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 21), @@ -274,13 +285,14 @@ def params_DecisionTreeRegressor(trial, name=None): # 'splitter': trial.suggest_categorical(f'splitter_{name}', ['best', 'random']), #'max_features': trial.suggest_categorical(f'max_features_{name}', [None, 'auto', 'sqrt', 'log2']), #'ccp_alpha': trial.suggest_float(f'ccp_alpha_{name}', 1e-1, 10.0), - + 'random_state': random_state + } return params def params_KNeighborsRegressor(trial, name=None, n_samples=100): params = { - 'n_neighbors': trial.suggest_int(f'n_neighbors_{name}', 1, 100), + 'n_neighbors': trial.suggest_int(f'n_neighbors_{name}', 1, n_samples, log=True ), 'weights': trial.suggest_categorical(f'weights_{name}', ['uniform', 'distance']), 'p': trial.suggest_int(f'p_{name}', 1, 3), 'metric': trial.suggest_categorical(f'metric_{name}', ['minkowski', 'euclidean', 'manhattan']), @@ -288,19 +300,20 @@ def params_KNeighborsRegressor(trial, name=None, n_samples=100): } return params -def params_LinearSVR(trial, name=None): +def params_LinearSVR(trial, random_state=None, name=None): params = { 'epsilon': trial.suggest_float(f'epsilon_{name}', 1e-4, 1.0, log=True), 'C': trial.suggest_float(f'C_{name}', 1e-4,25.0, log=True), 'dual': trial.suggest_categorical(f'dual_{name}', [True,False]), 'loss': trial.suggest_categorical(f'loss_{name}', ['epsilon_insensitive', 'squared_epsilon_insensitive']), + 'random_state': random_state } return params # XGBRegressor parameters -def params_XGBRegressor(trial, name=None): +def params_XGBRegressor(trial, random_state=None, name=None): return { 'learning_rate': trial.suggest_float(f'learning_rate_{name}', 1e-3, 1, log=True), 'subsample': trial.suggest_float(f'subsample_{name}', 0.05, 1.0), @@ -311,20 +324,22 @@ def params_XGBRegressor(trial, name=None): 'nthread': 1, 'verbosity': 0, 'objective': 'reg:squarederror', + 'random_state': random_state } -def params_AdaBoostRegressor(trial, name=None): +def params_AdaBoostRegressor(trial, random_state=None, name=None): params = { 'n_estimators': 100, 'learning_rate': trial.suggest_float(f'learning_rate_{name}', 1e-3, 1.0, log=True), 'loss': trial.suggest_categorical(f'loss_{name}', ['linear', 'square', 'exponential']), + 'random_state': random_state } return params # 
ExtraTreesRegressor parameters -def params_ExtraTreesRegressor(trial, name=None): +def params_ExtraTreesRegressor(trial, random_state=None, name=None): params = { 'n_estimators': 100, 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 1.0), @@ -333,7 +348,7 @@ def params_ExtraTreesRegressor(trial, name=None): 'bootstrap': trial.suggest_categorical(f'bootstrap_{name}', [True, False]), #'criterion': trial.suggest_categorical(f'criterion_{name}', ['squared_error', 'poisson', 'absolute_error', 'friedman_mse']), - + #'max_depth': trial.suggest_int(f'max_depth_{name}', 1, 10), #'min_weight_fraction_leaf': trial.suggest_float(f'min_weight_fraction_leaf_{name}', 0.0, 0.5), @@ -341,41 +356,41 @@ def params_ExtraTreesRegressor(trial, name=None): #'max_leaf_nodes': trial.suggest_int(f'max_leaf_nodes_{name}', 2, 100), #'min_impurity_decrease': trial.suggest_float(f'min_impurity_decrease_{name}', 1e-5, 1e-1, log=True), # 'min_impurity_split': trial.suggest_float(f'min_impurity_split_{name}', 1e-5, 1e-1, log=True), - + #if bootstrap is True #'oob_score': trial.suggest_categorical(f'oob_score_{name}', [True, False]), - + #'ccp_alpha': trial.suggest_float(f'ccp_alpha_{name}', 1e-5, 1e-1, log=True), # 'max_samples': trial.suggest_float(f'max_samples_{name}', 0.05, 1.00), + + 'random_state': random_state } return params -def make_regressor_config_dictionary(n_samples=10): +def make_regressor_config_dictionary(random_state=None, cv=None, n_samples=10): n_samples = min(n_samples,100) #TODO optimize this - + regressor_config_dictionary = { #ElasticNet: params_ElasticNet, ElasticNetCV: { 'l1_ratio': [.1, .5, .7, .9, .95, .99, 1], - 'cv': 5, + 'cv': cv, }, - ExtraTreesRegressor: params_ExtraTreesRegressor, - GradientBoostingRegressor: params_GradientBoostingRegressor, - AdaBoostRegressor: params_AdaBoostRegressor, - DecisionTreeRegressor: params_DecisionTreeRegressor, + ExtraTreesRegressor: partial(params_ExtraTreesRegressor, random_state=random_state), + GradientBoostingRegressor: partial(params_GradientBoostingRegressor, random_state=random_state), + AdaBoostRegressor: partial(params_AdaBoostRegressor, random_state=random_state), + DecisionTreeRegressor: partial(params_DecisionTreeRegressor, random_state=random_state), KNeighborsRegressor: partial(params_KNeighborsRegressor,n_samples=n_samples), - LassoLarsCV: params_LassoLarsCV, + LassoLarsCV: partial(params_LassoLarsCV, cv=cv), SVR: params_SVR, - RandomForestRegressor: params_RandomForestRegressor, - RidgeCV: {}, - XGBRegressor: params_XGBRegressor, - SGDRegressor: params_SGDRegressor, + RandomForestRegressor: partial(params_RandomForestRegressor, random_state=random_state), + RidgeCV: {'cv': cv}, + XGBRegressor: partial(params_XGBRegressor, random_state=random_state), + SGDRegressor: partial(params_SGDRegressor, random_state= random_state), } - - return regressor_config_dictionary - + return regressor_config_dictionary \ No newline at end of file diff --git a/tpot2/config/regressors_sklearnex.py b/tpot2/config/regressors_sklearnex.py index 4eb10f1c..279d2dba 100644 --- a/tpot2/config/regressors_sklearnex.py +++ b/tpot2/config/regressors_sklearnex.py @@ -9,14 +9,19 @@ from sklearnex.ensemble import RandomForestRegressor from sklearnex.neighbors import KNeighborsRegressor +import numpy as np -def params_RandomForestRegressor(trial, name=None): +from functools import partial + + +def params_RandomForestRegressor(trial, random_state=None, name=None): return { 'n_estimators': 100, 'max_features': trial.suggest_float(f'max_features_{name}', 0.05, 
1.0), 'bootstrap': trial.suggest_categorical(name=f'bootstrap_{name}', choices=[True, False]), 'min_samples_split': trial.suggest_int(f'min_samples_split_{name}', 2, 21), 'min_samples_leaf': trial.suggest_int(f'min_samples_leaf_{name}', 1, 21), + 'random_state': random_state } def params_KNeighborsRegressor(trial, name=None, n_samples=100): @@ -29,14 +34,15 @@ def params_KNeighborsRegressor(trial, name=None, n_samples=100): def params_LinearRegression(trial, name=None): return {} -def params_Ridge(trial, name=None): +def params_Ridge(trial, random_state=None, name=None): return { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), + 'random_state': random_state } -def params_Lasso(trial, name=None): +def params_Lasso(trial, random_state=None, name=None): return { 'alpha': trial.suggest_float(f'alpha_{name}', 0.0, 1.0), 'fit_intercept': True, @@ -44,22 +50,26 @@ def params_Lasso(trial, name=None): 'tol': trial.suggest_float(f'tol_{name}', 1e-5, 1e-1, log=True), 'positive': trial.suggest_categorical(f'positive_{name}', [True, False]), 'selection': trial.suggest_categorical(f'selection_{name}', ['cyclic', 'random']), + 'random_state': random_state } -def params_ElasticNet(trial, name=None): - return { - 'alpha': 1 - trial.suggest_float(f'alpha_{name}', 0.0, 1.0), +def params_ElasticNet(trial, random_state=None, name=None): + params = { + 'alpha': 1 - trial.suggest_float(f'alpha_{name}', 0.0, 1.0, log=True), 'l1_ratio': 1- trial.suggest_float(f'l1_ratio_{name}',0.0, 1.0), + 'random_state': random_state } + return params def params_SVR(trial, name=None): - return { + params = { 'kernel': trial.suggest_categorical(name=f'kernel_{name}', choices=['poly', 'rbf', 'linear', 'sigmoid']), 'C': trial.suggest_float(f'C_{name}', 1e-4, 25, log=True), 'degree': trial.suggest_int(f'degree_{name}', 1, 4), 'max_iter': 3000, 'tol': 0.005, } + return params def params_NuSVR(trial, name=None): return { @@ -71,14 +81,14 @@ def params_NuSVR(trial, name=None): 'tol': 0.005, } -def make_sklearnex_regressor_config_dictionary(n_samples=10): +def make_sklearnex_regressor_config_dictionary(random_state=None, n_samples=10): return { - RandomForestRegressor: params_RandomForestRegressor, + RandomForestRegressor: partial(params_RandomForestRegressor, random_state=random_state), KNeighborsRegressor: params_KNeighborsRegressor, LinearRegression: params_LinearRegression, - Ridge: params_Ridge, - Lasso: params_Lasso, - ElasticNet: params_ElasticNet, + Ridge: partial(params_Ridge, random_state=random_state), + Lasso: partial(params_Lasso, random_state=random_state), + ElasticNet: partial(params_ElasticNet, random_state=random_state), SVR: params_SVR, NuSVR: params_NuSVR, } diff --git a/tpot2/config/selectors.py b/tpot2/config/selectors.py index 12c3e3c1..42589d83 100644 --- a/tpot2/config/selectors.py +++ b/tpot2/config/selectors.py @@ -1,4 +1,4 @@ -#TODO: how to best support transformers/selectors that take other transformers with their own hyperparameters? +#TODO: how to best support transformers/selectors that take other transformers with their own hyperparameters? import numpy as np from sklearn.feature_selection import SelectFwe from sklearn.feature_selection import SelectPercentile @@ -29,15 +29,16 @@ def params_sklearn_feature_selection_VarianceThreshold(trial, name=None): return { 'threshold': trial.suggest_float(f'threshold_{name}', 1e-4, .2, log=True) } - + #TODO add more estimator options? How will that interact with optuna? 
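The selectors.py changes below pass the same random_state into the ExtraTrees estimator that RFE and SelectFromModel wrap. A minimal sketch, using plain scikit-learn classes and synthetic data rather than the wrapped TPOT2 builtins, of why that makes the nested feature selection repeatable.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

X, y = make_classification(n_samples=100, n_features=8, random_state=0)

def build_rfe(random_state):
    # the nested estimator receives the shared random_state, as in params_sklearn_feature_selection_RFE
    estimator = ExtraTreesClassifier(n_estimators=20, random_state=random_state)
    return RFE(estimator, step=0.5).fit(X, y)

# Two selectors built with the same random_state keep the same features.
assert np.array_equal(build_rfe(7).support_, build_rfe(7).support_)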
-def params_sklearn_feature_selection_RFE(trial, name=None, classifier=True): +def params_sklearn_feature_selection_RFE(trial, random_state=None, name=None, classifier=True): + if classifier: - estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, name=f"RFE_{name}")) + estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, random_state=random_state, name=f"RFE_{name}")) else: - estimator = ExtraTreesRegressor(**params_ExtraTreesRegressor(trial, name=f"RFE_{name}")) - + estimator = ExtraTreesRegressor(**params_ExtraTreesRegressor(trial, random_state=random_state, name=f"RFE_{name}")) + params = { 'step': trial.suggest_float(f'step_{name}', 1e-4, 1.0, log=False), 'estimator' : estimator, @@ -46,12 +47,13 @@ def params_sklearn_feature_selection_RFE(trial, name=None, classifier=True): return params -def params_sklearn_feature_selection_SelectFromModel(trial, name=None, classifier=True): +def params_sklearn_feature_selection_SelectFromModel(trial, random_state=None, name=None, classifier=True): + if classifier: - estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, name=f"SFM_{name}")) + estimator = ExtraTreesClassifier(**params_ExtraTreesClassifier(trial, random_state=random_state, name=f"SFM_{name}")) else: - estimator = ExtraTreesRegressor(**params_ExtraTreesRegressor(trial, name=f"SFM_{name}")) - + estimator = ExtraTreesRegressor(**params_ExtraTreesRegressor(trial, random_state=random_state, name=f"SFM_{name}")) + params = { 'threshold': trial.suggest_float(f'threshold_{name}', 1e-4, 1.0, log=True), 'estimator' : estimator, @@ -61,47 +63,49 @@ def params_sklearn_feature_selection_SelectFromModel(trial, name=None, classifie -def params_sklearn_feature_selection_RFE_wrapped(trial, name=None, classifier=True): +def params_sklearn_feature_selection_RFE_wrapped(trial, random_state=None, name=None, classifier=True): + params = { 'step': trial.suggest_float(f'step_{name}', 1e-4, 1.0, log=False), } - + if classifier: - estimator_params = params_ExtraTreesClassifier(trial, name=f"RFE_{name}") + estimator_params = params_ExtraTreesClassifier(trial, random_state=random_state, name=f"RFE_{name}") else: - estimator_params = params_ExtraTreesRegressor(trial, name=f"RFE_{name}") - + estimator_params = params_ExtraTreesRegressor(trial, random_state=random_state, name=f"RFE_{name}") + params.update(estimator_params) return params -def params_sklearn_feature_selection_SelectFromModel_wrapped(trial, name=None, classifier=True): +def params_sklearn_feature_selection_SelectFromModel_wrapped(trial, random_state=None, name=None, classifier=True): + params = { 'threshold': trial.suggest_float(f'threshold_{name}', 1e-4, 1.0, log=True), } - + if classifier: - estimator_params = params_ExtraTreesClassifier(trial, name=f"SFM_{name}") + estimator_params = params_ExtraTreesClassifier(trial, random_state=random_state, name=f"SFM_{name}") else: - estimator_params = params_ExtraTreesRegressor(trial, name=f"SFM_{name}") - + estimator_params = params_ExtraTreesRegressor(trial, random_state=random_state, name=f"SFM_{name}") + params.update(estimator_params) return params -def make_selector_config_dictionary(classifier=True): +def make_selector_config_dictionary(random_state=None, classifier=True): if classifier: - params = {RFE_ExtraTreesClassifier : partial(params_sklearn_feature_selection_RFE_wrapped, classifier=classifier), - SelectFromModel_ExtraTreesClassifier : partial(params_sklearn_feature_selection_SelectFromModel_wrapped, classifier=classifier), + params = 
{RFE_ExtraTreesClassifier : partial(params_sklearn_feature_selection_RFE_wrapped, random_state=random_state, classifier=classifier), + SelectFromModel_ExtraTreesClassifier : partial(params_sklearn_feature_selection_SelectFromModel_wrapped, random_state=random_state, classifier=classifier), } else: - params = {RFE_ExtraTreesRegressor : partial(params_sklearn_feature_selection_RFE_wrapped, classifier=classifier), - SelectFromModel_ExtraTreesRegressor : partial(params_sklearn_feature_selection_SelectFromModel_wrapped, classifier=classifier), + params = {RFE_ExtraTreesRegressor : partial(params_sklearn_feature_selection_RFE_wrapped, random_state=random_state, classifier=classifier), + SelectFromModel_ExtraTreesRegressor : partial(params_sklearn_feature_selection_SelectFromModel_wrapped, random_state=random_state, classifier=classifier), } - + params.update({ SelectFwe: params_sklearn_feature_selection_SelectFwe, SelectPercentile: params_sklearn_feature_selection_SelectPercentile, VarianceThreshold: params_sklearn_feature_selection_VarianceThreshold,}) diff --git a/tpot2/config/special_configs.py b/tpot2/config/special_configs.py index 0f1e1e85..a6745b6f 100644 --- a/tpot2/config/special_configs.py +++ b/tpot2/config/special_configs.py @@ -11,9 +11,9 @@ def params_arthmetic_operator(trial, name=None): } def make_arithmetic_transformer_config_dictionary(): - return { + return { AddTransformer: {}, - mul_neg_1_Transformer: {}, + mul_neg_1_Transformer: {}, MulTransformer: {}, SafeReciprocalTransformer: {}, EQTransformer: {}, @@ -22,7 +22,7 @@ def make_arithmetic_transformer_config_dictionary(): GTTransformer: {}, LETransformer: {}, LTTransformer: {}, - MinTransformer: {}, + MinTransformer: {}, MaxTransformer: {}, } @@ -65,10 +65,10 @@ def make_FSS_config_dictionary(subsets=None, n_features=None, feature_names=None Parameters ---------- subsets: Sets the subsets to select from. - - str : If a string, it is assumed to be a path to a csv file with the subsets. + - str : If a string, it is assumed to be a path to a csv file with the subsets. The first column is assumed to be the name of the subset and the remaining columns are the features in the subset. - list or np.ndarray : If a list or np.ndarray, it is assumed to be a list of subsets. - + n_features: int the number of features in the dataset. If subsets is None, each column will be treated as a subset. One column will be selected per subset. 
""" @@ -76,10 +76,10 @@ def make_FSS_config_dictionary(subsets=None, n_features=None, feature_names=None #require at least of of the parameters if subsets is None and n_features is None: raise ValueError('At least one of the parameters must be provided') - + if isinstance(subsets, str): df = pd.read_csv(subsets,header=None,index_col=0) - df['features'] = df.apply(lambda x: list([x[c] for c in df.columns]),axis=1) + df['features'] = df.apply(lambda x: list([x[c] for c in df.columns]),axis=1) subset_dict = {} for row in df.index: subset_dict[row] = df.loc[row]['features'] diff --git a/tpot2/config/transformers.py b/tpot2/config/transformers.py index 6608c6eb..ab17b3c8 100644 --- a/tpot2/config/transformers.py +++ b/tpot2/config/transformers.py @@ -1,4 +1,6 @@ from functools import partial +import numpy as np + from tpot2.builtin_modules import ZeroCount, OneHotEncoder from sklearn.preprocessing import Binarizer from sklearn.decomposition import FastICA @@ -19,14 +21,16 @@ def params_sklearn_preprocessing_Binarizer(trial, name=None): 'threshold': trial.suggest_float(f'threshold_{name}', 0.0, 1.0), } -def params_sklearn_decomposition_FastICA(trial, name=None, n_features=100): +def params_sklearn_decomposition_FastICA(trial, random_state=None, name=None, n_features=100): return { + 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), # number of components wrt number of features 'algorithm': trial.suggest_categorical(f'algorithm_{name}', ['parallel', 'deflation']), 'whiten':'unit-variance', + 'random_state': random_state } def params_sklearn_cluster_FeatureAgglomeration(trial, name=None, n_features=100): - + linkage = trial.suggest_categorical(f'linkage_{name}', ['ward', 'complete', 'average']) if linkage == 'ward': metric = 'euclidean' @@ -35,74 +39,65 @@ def params_sklearn_cluster_FeatureAgglomeration(trial, name=None, n_features=100 return { 'linkage': linkage, 'metric': metric, - 'n_clusters': trial.suggest_int(f'n_clusters_{name}', 2, 4), #TODO perhaps a percentage of n_features + 'n_clusters': trial.suggest_int(f'n_clusters_{name}', 2, n_features-1), #TODO perhaps a percentage of n_features } - - def params_sklearn_preprocessing_Normalizer(trial, name=None): return { 'norm': trial.suggest_categorical(f'norm_{name}', ['l1', 'l2', 'max']), } -def params_sklearn_kernel_approximation_Nystroem(trial, name=None, n_features=100): +def params_sklearn_kernel_approximation_Nystroem(trial, random_state=None, name=None, n_features=100): return { 'gamma': trial.suggest_float(f'gamma_{name}', 0.0, 1.0), 'kernel': trial.suggest_categorical(f'kernel_{name}', ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid']), - 'n_components': trial.suggest_int(f'n_components_{name}', 1, 11), #TODO perhaps a percentage of n_features + 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), + 'random_state': random_state } -def params_sklearn_decomposition_PCA(trial, name=None, n_features=100): +def params_sklearn_decomposition_PCA(trial, random_state=None, name=None, n_features=100): # keep the number of components required to explain 'variance_explained' of the variance - variance_explained = 1 - trial.suggest_float(f'n_components_{name}', 0.001, 0.5, log=True) #values closer to 1 are more likely - + variance_explained = 1.0 - trial.suggest_float(f'n_components_{name}', 0.001, 0.5, log=True) #values closer to 1 are more likely + return { 'n_components': variance_explained, + 'random_state': random_state } - -def 
params_sklearn_kernel_approximation_RBFSampler(trial, name=None, n_features=100): +def params_sklearn_kernel_approximation_RBFSampler(trial, random_state=None, name=None, n_features=100): return { + 'n_components': trial.suggest_int(f'n_components_{name}', 1, n_features), 'gamma': trial.suggest_float(f'gamma_{name}', 0.0, 1.0), + 'random_state': random_state } - - - def params_tpot_builtins_ZeroCount(trial, name=None): return {} - def params_tpot_builtins_OneHotEncoder(trial, name=None): return {} - - - - -def make_transformer_config_dictionary(n_features=10): +def make_transformer_config_dictionary(random_state=None, n_features=10): #n_features = min(n_features,100) #TODO optimize this return { Binarizer: params_sklearn_preprocessing_Binarizer, - FastICA: partial(params_sklearn_decomposition_FastICA,n_features=n_features), + FastICA: partial(params_sklearn_decomposition_FastICA, random_state=random_state, n_features=n_features), FeatureAgglomeration: partial(params_sklearn_cluster_FeatureAgglomeration,n_features=n_features), MaxAbsScaler: {}, MinMaxScaler: {}, Normalizer: params_sklearn_preprocessing_Normalizer, - Nystroem: partial(params_sklearn_kernel_approximation_Nystroem,n_features=n_features), - PCA: partial(params_sklearn_decomposition_PCA,n_features=n_features), + Nystroem: partial(params_sklearn_kernel_approximation_Nystroem, random_state=random_state, n_features=n_features), + PCA: partial(params_sklearn_decomposition_PCA, random_state=random_state, n_features=n_features), PolynomialFeatures: { 'degree': 2, 'include_bias': False, 'interaction_only': False, }, - RBFSampler: partial(params_sklearn_kernel_approximation_RBFSampler,n_features=n_features), + RBFSampler: partial(params_sklearn_kernel_approximation_RBFSampler, random_state=random_state, n_features=n_features), RobustScaler: {}, StandardScaler: {}, ZeroCount: params_tpot_builtins_ZeroCount, OneHotEncoder: params_tpot_builtins_OneHotEncoder, } - - diff --git a/tpot2/evolvers/base_evolver.py b/tpot2/evolvers/base_evolver.py index 7a94cf10..9959f9ab 100644 --- a/tpot2/evolvers/base_evolver.py +++ b/tpot2/evolvers/base_evolver.py @@ -20,16 +20,18 @@ import math from tpot2.utils.utils import get_thresholds, beta_interpolation, remove_items, equalize_list -def ind_mutate(ind): - return ind.mutate() +def ind_mutate(ind, rng_): + rng = np.random.default_rng(rng_) + return ind.mutate(rng_=rng) -def ind_crossover(ind1, ind2): - return ind1.crossover(ind2) +def ind_crossover(ind1, ind2, rng_): + rng = np.random.default_rng(rng_) + return ind1.crossover(ind2, rng_=rng) class BaseEvolver(): - def __init__( self, + def __init__( self, individual_generator , - + objective_functions, objective_function_weights, objective_names = None, @@ -38,53 +40,55 @@ def __init__( self, population_size = 50, initial_population_size = None, - population_scaling = .5, - generations_until_end_population = 1, - generations = 50, + population_scaling = .5, + generations_until_end_population = 1, + generations = 50, early_stop = None, early_stop_tol = 0.001, - - max_time_seconds=float("inf"), + + max_time_seconds=float("inf"), max_eval_time_seconds=60*5, n_jobs=1, memory_limit="4GB", client=None, - + survival_percentage = 1, crossover_probability=.2, mutate_probability=.7, mutate_then_crossover_probability=.05, crossover_then_mutate_probability=.05, - + mutation_functions = [ind_mutate], crossover_functions = [ind_crossover], mutation_function_weights = None, crossover_function_weights = None, - + n_parents=2, survival_selector = survival_select_NSGA2, 
parent_selector = tournament_selection_dominated, - - budget_range = None, - budget_scaling = .5, - generations_until_end_budget = 1, + + budget_range = None, + budget_scaling = .5, + generations_until_end_budget = 1, stepwise_steps = 5, - - threshold_evaluation_early_stop = None, + + threshold_evaluation_early_stop = None, threshold_evaluation_scaling = .5, min_history_threshold = 20, selection_evaluation_early_stop = None, selection_evaluation_scaling = .5, - evaluation_early_stop_steps = None, + evaluation_early_stop_steps = None, final_score_strategy = "mean", - verbose = 0, + verbose = 0, periodic_checkpoint_folder = None, callback = None, + rng_=None, + ) -> None: """ Uses mutation, crossover, and optimization functions to evolve a population of individuals towards the given objective functions. @@ -95,7 +99,7 @@ def __init__( self, Generator that yields new base individuals. Used to generate initial population. objective_functions : list of callables list of functions that get applied to the individual and return a float or list of floats - If an objective function returns multiple values, they are all concatenated in order + If an objective function returns multiple values, they are all concatenated in order with respect to objective_function_weights and early_stop_tol. objective_function_weights : list of floats list of weights for each objective function. Sign flips whether bigger is better or not @@ -111,8 +115,8 @@ def __init__( self, Size of the initial population. If None, population_size will be used. population_scaling : int, default=0.5 Scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. - generations_until_end_population : int, default=1 - Number of generations until the population size reaches population_size + generations_until_end_population : int, default=1 + Number of generations until the population size reaches population_size generations : int, default=50 Number of generations to run early_stop : int, default=None @@ -121,7 +125,7 @@ def __init__( self, -list of floats list of tolerances for each objective function. If the difference between the best score and the current score is less than the tolerance, the individual is considered to have converged If an index of the list is None, that item will not be used for early stopping - -int + -int If an int is given, it will be used as the tolerance for all objectives max_time_seconds : float, default=float("inf") Maximum time to run the optimization. If none or inf, will run until the end of the generations. @@ -132,9 +136,9 @@ def __init__( self, memory_limit : str, default="4GB" Memory limit for each job. See Dask [LocalCluster documentation](https://distributed.dask.org/en/stable/api.html#distributed.Client) for more information. client : dask.distributed.Client, default=None - A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. + A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. survival_percentage : float, default=1 - Percentage of the population size to utilize for mutation and crossover at the beginning of the generation. The rest are discarded. Individuals are selected with the selector passed into survival_selector. 
The value of this parameter must be between 0 and 1, inclusive. + Percentage of the population size to utilize for mutation and crossover at the beginning of the generation. The rest are discarded. Individuals are selected with the selector passed into survival_selector. The value of this parameter must be between 0 and 1, inclusive. For example, if the population size is 100 and the survival percentage is .5, 50 individuals will be selected with NSGA2 from the existing population. These will be used for mutation and crossover to generate the next 100 individuals for the next generation. The remainder are discarded from the live population. In the next generation, there will now be the 50 parents + the 100 individuals for a total of 150. Survival percentage is based on the population size parameter and not the existing population size (current population size when using successive halving). Therefore, in the next generation we will still select 50 individuals from the currently existing 150. crossover_probability : float, default=.2 Probability of generating a new individual by crossover between two individuals. @@ -170,12 +174,12 @@ def __init__( self, selection_evaluation_early_stop : list, default=None A lower and upper percent of the population size to select each round of CV. Values between 0 and 1. - selection_evaluation_scaling : float, default=0.5 + selection_evaluation_scaling : float, default=0.5 A scaling factor to use when determining how fast the threshold moves from the start to end percentile. Must be greater than zero. Higher numbers will move the threshold to the end faster. evaluation_early_stop_steps : int, default=1 The number of steps that will be taken from the objective function. (e.g., the number of CV folds to evaluate) - final_score_strategy : str, default="mean" + final_score_strategy : str, default="mean" The strategy to use when determining the final score for an individual. "mean": The mean of all objective scores "last": The score returned by the last call. Currently each objective is evaluated with a clone of the individual. @@ -192,16 +196,24 @@ def __init__( self, If provided, training will resume from this checkpoint. callback : tpot2.CallBackInterface, default=None Callback object. Not implemented + rng_ : Numpy.Random.Generator, None, default=None + An object for reproducibility of experiments. This value will be passed to numpy.random.default_rng() to create an instance of the generator to pass to other classes. + + - Numpy.Random.Generator + Will be used to create and lock in a Generator instance with 'numpy.random.default_rng()'. Note this will be the same Generator passed in. 
+ - None + Will be used to create Generator for 'numpy.random.default_rng()' where a fresh, unpredictable entropy will be pulled from the OS """ + self.rng = np.random.default_rng(rng_) if threshold_evaluation_early_stop is not None or selection_evaluation_early_stop is not None: if evaluation_early_stop_steps is None: raise ValueError("evaluation_early_stop_steps must be set when using threshold_evaluation_early_stop or selection_evaluation_early_stop") - self.individual_generator = individual_generator - self.population_size = population_size - self.objective_functions = objective_functions + self.individual_generator = individual_generator + self.population_size = population_size + self.objective_functions = objective_functions self.objective_function_weights = np.array(objective_function_weights) self.bigger_is_better = bigger_is_better if not bigger_is_better: @@ -220,32 +232,32 @@ def __init__( self, self.periodic_checkpoint_folder = periodic_checkpoint_folder - self.verbose = verbose - self.callback = callback - self.generations = generations + self.verbose = verbose + self.callback = callback + self.generations = generations self.n_jobs = n_jobs - + if max_time_seconds is None: self.max_time_seconds = float("inf") else: - self.max_time_seconds = max_time_seconds - + self.max_time_seconds = max_time_seconds + #functools requires none for infinite time, doesn't support inf if max_eval_time_seconds is not None and math.isinf(max_eval_time_seconds ): self.max_eval_time_seconds = None else: self.max_eval_time_seconds = max_eval_time_seconds - - + + self.generation = 0 self.threshold_evaluation_early_stop =threshold_evaluation_early_stop - self.threshold_evaluation_scaling = max(0.00001,threshold_evaluation_scaling ) + self.threshold_evaluation_scaling = max(0.00001,threshold_evaluation_scaling ) self.min_history_threshold = min_history_threshold self.selection_evaluation_early_stop = selection_evaluation_early_stop @@ -266,7 +278,7 @@ def __init__( self, self.survival_selector=survival_selector self.parent_selector=parent_selector self.survival_percentage = survival_percentage - + total_var_p = crossover_probability + mutate_probability + mutate_then_crossover_probability + crossover_then_mutate_probability self.crossover_probability = crossover_probability / total_var_p self.mutate_probability = mutate_probability / total_var_p @@ -324,7 +336,7 @@ def __init__( self, self.budget = self.budget_list[self.generation] else: self.budget = None - + self.early_stop_tol = early_stop_tol self.early_stop = early_stop @@ -343,7 +355,7 @@ def __init__( self, if os.path.exists(self.population_file): self.population = pickle.load(open(self.population_file, "rb")) - if len(self.population.evaluated_individuals)>0 and "Generation" in self.population.evaluated_individuals.columns: + if len(self.population.evaluated_individuals)>0 and "Generation" in self.population.evaluated_individuals.columns: self.generation = self.population.evaluated_individuals['Generation'].max() + 1 #TODO check if this is empty? 
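The rng_ value documented above is handed straight to numpy.random.default_rng, both here and in the Trial and select helpers, so the evolver accepts an integer seed, an existing Generator, or None. A short sketch of that contract (the seed values are arbitrary):

import numpy as np

gen = np.random.default_rng(123)
assert np.random.default_rng(gen) is gen                               # a Generator is returned unaltered
assert isinstance(np.random.default_rng(123), np.random.Generator)     # an int seeds a new Generator
assert isinstance(np.random.default_rng(None), np.random.Generator)    # None draws fresh entropy from the OS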
init_names = self.objective_names @@ -352,7 +364,7 @@ def __init__( self, if self.population is None: self.population = tpot2.Population(column_names=init_names) initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size)] - self.population.add_to_population(initial_population) + self.population.add_to_population(initial_population, self.rng) self.population.update_column(self.population.population, column_names="Generation", data=self.generation) @@ -374,14 +386,14 @@ def optimize(self, generations=None): processes=True, memory_limit=self.memory_limit) self._client = Client(self._cluster) - + if generations is None: generations = self.generations - start_time = time.time() - + start_time = time.time() + generations_without_improvement = np.array([0 for _ in range(len(self.objective_function_weights))]) best_scores = [-np.inf for _ in range(len(self.objective_function_weights))] @@ -389,7 +401,7 @@ def optimize(self, generations=None): self.scheduled_timeout_time = time.time() + self.max_time_seconds - try: + try: #for gen in tnrange(generations,desc="Generation", disable=self.verbose<1): done = False gen = 0 @@ -407,7 +419,7 @@ def optimize(self, generations=None): self.evaluate_population() if self.population_file is not None: pickle.dump(self.population, open(self.population_file, "wb")) - + attempts = 2 while len(self.population.population) == 0 and attempts > 0: new_initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size)] @@ -424,8 +436,8 @@ def optimize(self, generations=None): if time.time() - start_time > self.max_time_seconds: break self.step() - - if self.verbose >= 3: + + if self.verbose >= 3: sign = np.sign(self.objective_function_weights) valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign cur_best_scores = valid_df.max(axis=0)*sign @@ -444,7 +456,7 @@ def optimize(self, generations=None): cur_best_scores = valid_df.max(axis=0) cur_best_scores = cur_best_scores.to_numpy() #cur_best_scores = self.population.get_column(self.population.population, column_names=self.objective_names).max(axis=0)*sign #TODO this assumes the current population is the best - + improved = ( np.array(cur_best_scores) - np.array(best_scores) >= np.array(self.early_stop_tol) ) not_improved = np.logical_not(improved) generations_without_improvement = generations_without_improvement * not_improved + not_improved #set to zero if not improved, else increment @@ -471,12 +483,12 @@ def optimize(self, generations=None): except KeyboardInterrupt: if self.verbose >= 3: print("KeyboardInterrupt") - + self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID") self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT") - + if self.population_file is not None: pickle.dump(self.population, open(self.population_file, "wb")) @@ -504,43 +516,44 @@ def step(self,): if self.survival_selector is not None: n_survivors = max(1,int(self.cur_population_size*self.survival_percentage)) #always keep at least one individual - self.population.survival_select( selector=self.survival_selector, - weights=self.objective_function_weights, - columns_names=self.objective_names, - n_survivors=n_survivors, - inplace=True) - + self.population.survival_select( selector=self.survival_selector, + weights=self.objective_function_weights, + 
columns_names=self.objective_names, + n_survivors=n_survivors, + inplace=True, + rng_=self.rng,) + self.generate_offspring() self.evaluate_population() self.generation += 1 - + def generate_offspring(self, ): #your EA Algorithm goes here - parents = self.population.parent_select(selector=self.parent_selector, weights=self.objective_function_weights, columns_names=self.objective_names, k=self.cur_population_size, n_parents=2) + parents = self.population.parent_select(selector=self.parent_selector, weights=self.objective_function_weights, columns_names=self.objective_names, k=self.cur_population_size, n_parents=2, rng_=self.rng) p = np.array([self.crossover_probability, self.mutate_then_crossover_probability, self.crossover_then_mutate_probability, self.mutate_probability]) p = p / p.sum() - var_op_list = np.random.choice(["crossover", "mutate_then_crossover", "crossover_then_mutate", "mutate"], size=self.cur_population_size, p=p) + var_op_list = self.rng.choice(["crossover", "mutate_then_crossover", "crossover_then_mutate", "mutate"], size=self.cur_population_size, p=p) for i, op in enumerate(var_op_list): if op == "mutate": parents[i] = parents[i][0] #mutations take a single individual - - offspring = self.population.create_offspring2(parents, var_op_list, self.mutation_functions, self.mutation_function_weights, self.crossover_functions, self.crossover_function_weights, add_to_population=True, keep_repeats=False, mutate_until_unique=True) - + + offspring = self.population.create_offspring2(parents, var_op_list, self.mutation_functions, self.mutation_function_weights, self.crossover_functions, self.crossover_function_weights, add_to_population=True, keep_repeats=False, mutate_until_unique=True, rng_=self.rng) + self.population.update_column(offspring, column_names="Generation", data=self.generation, ) - - + + # Gets a list of unevaluated individuals in the livepopulation, evaluates them, and removes failed attempts # TODO This could probably be an independent function? 
def evaluate_population(self,): - - #Update the sliding scales and thresholds + + #Update the sliding scales and thresholds # Save population, TODO remove some of these if self.population_file is not None: # and time.time() - last_save_time > 60*10: pickle.dump(self.population, open(self.population_file, "wb")) @@ -554,8 +567,8 @@ def evaluate_population(self,): old_data = old_data[old_data[self.objective_names].notnull().all(axis=1)] if len(old_data) >= self.min_history_threshold: self.thresholds = np.array([get_thresholds(old_data[obj_name], - start=self.threshold_evaluation_early_stop[0], - end=self.threshold_evaluation_early_stop[1], + start=self.threshold_evaluation_early_stop[0], + end=self.threshold_evaluation_early_stop[1], scale=self.threshold_evaluation_scaling, n=self.evaluation_early_stop_steps) for obj_name in self.objective_names]).T @@ -565,7 +578,7 @@ def evaluate_population(self,): lower = self.cur_population_size*self.selection_evaluation_early_stop[0] upper = self.cur_population_size*self.selection_evaluation_early_stop[1] #survival_counts = self.cur_population_size*(scipy.special.betainc(1,self.selection_evaluation_scaling,np.linspace(0,1,self.evaluation_early_stop_steps))*(upper-lower)+lower) - + survival_counts = np.array(beta_interpolation(start=lower, end=upper, scale=self.selection_evaluation_scaling, n=self.evaluation_early_stop_steps, n_steps=self.evaluation_early_stop_steps)) self.survival_counts = survival_counts.astype(int) else: @@ -591,7 +604,7 @@ def evaluate_population(self,): def evaluate_population_full(self, budget=None): individuals_to_evaluate = self.get_unevaluated_individuals(self.objective_names, budget=budget,) - + #print("evaluating this many individuals: ", len(individuals_to_evaluate)) if len(individuals_to_evaluate) == 0: @@ -608,7 +621,7 @@ def evaluate_population_full(self, budget=None): parallel_timeout = min(theoretical_timeout, scheduled_timeout_time_left) if parallel_timeout < 0: parallel_timeout = 10 - + #scores = tpot2.utils.eval_utils.parallel_eval_objective_list(individuals_to_evaluate, self.objective_functions, self.n_jobs, verbose=self.verbose, timeout=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, parallel_timeout=parallel_timeout, **self.objective_kwargs) scores, start_times, end_times = tpot2.utils.eval_utils.parallel_eval_objective_list2(individuals_to_evaluate, self.objective_functions, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, **self.objective_kwargs) @@ -638,7 +651,7 @@ def get_unevaluated_individuals(self, column_names, budget=None, individual_list unevaluated_filter = lambda i: any(offspring_scores.loc[offspring_scores.index[i]][column_names].isna()) unevaluated_individuals_this_step = [i for i in range(len(cur_pop)) if unevaluated_filter(i)] return cur_pop[unevaluated_individuals_this_step] - + else: #if column names are not in the evaluated_individuals, then we have not evaluated any individuals yet for name_step in column_names: self.population.evaluated_individuals[name_step] = np.nan @@ -655,23 +668,23 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No cur_individuals = self.population.population.copy() - + all_step_names = [] for step in range(self.evaluation_early_stop_steps): if budget is None: this_step_names = [f"{n}_step_{step}" for n in self.objective_names] else: this_step_names = 
[f"{n}_budget_{budget}_step_{step}" for n in self.objective_names] - + all_step_names.append(this_step_names) - + unevaluated_individuals_this_step = self.get_unevaluated_individuals(this_step_names, budget=None, individual_list=cur_individuals) if len(unevaluated_individuals_this_step) == 0: if self.verbose > 3: print("No new individuals to evaluate") continue - + if self.max_eval_time_seconds is not None: theoretical_timeout = self.max_eval_time_seconds * math.ceil(len(unevaluated_individuals_this_step) / self.n_jobs) theoretical_timeout = theoretical_timeout*2 @@ -753,9 +766,9 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No invalids.append(i) if len(invalids) > 0: - + max_to_remove = min(len(cur_individuals) - self.n_jobs, len(invalids)) - + if max_to_remove < len(invalids): invalids = np.random.choice(invalids, max_to_remove, replace=False) @@ -771,5 +784,3 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No new_population_index = survival_selector(weighted_scores, k=k) cur_individuals = np.array(cur_individuals)[new_population_index] - - diff --git a/tpot2/evolvers/steady_state_evolver.py b/tpot2/evolvers/steady_state_evolver.py index 23abdfe9..a45e4059 100644 --- a/tpot2/evolvers/steady_state_evolver.py +++ b/tpot2/evolvers/steady_state_evolver.py @@ -23,9 +23,9 @@ import warnings class SteadyStateEvolver(): - def __init__( self, + def __init__( self, individual_generator , - + objective_functions, objective_function_weights, objective_names = None, @@ -34,19 +34,19 @@ def __init__( self, initial_population_size = 50, population_size = 50, - max_evaluated_individuals = None, + max_evaluated_individuals = None, early_stop = None, early_stop_seconds = None, early_stop_tol = 0.001, - - max_time_seconds=float("inf"), + + max_time_seconds=float("inf"), max_eval_time_seconds=60*5, n_jobs=1, memory_limit="4GB", client=None, - + crossover_probability=.2, mutate_probability=.7, @@ -56,24 +56,28 @@ def __init__( self, survival_selector = survival_select_NSGA2, parent_selector = tournament_selection_dominated, - - budget_range = None, - budget_scaling = .5, - individuals_until_end_budget = 1, + + budget_range = None, + budget_scaling = .5, + individuals_until_end_budget = 1, stepwise_steps = 5, - - verbose = 0, + + verbose = 0, periodic_checkpoint_folder = None, callback = None, + + rng_=None ) -> None: + self.rng = np.random.default_rng(rng_) + self.max_evaluated_individuals = max_evaluated_individuals self.individuals_until_end_budget = individuals_until_end_budget - self.individual_generator = individual_generator - self.population_size = population_size - self.objective_functions = objective_functions + self.individual_generator = individual_generator + self.population_size = population_size + self.objective_functions = objective_functions self.objective_function_weights = np.array(objective_function_weights) self.bigger_is_better = bigger_is_better if not bigger_is_better: @@ -83,15 +87,15 @@ def __init__( self, self.periodic_checkpoint_folder = periodic_checkpoint_folder - self.verbose = verbose - self.callback = callback + self.verbose = verbose + self.callback = callback self.n_jobs = n_jobs - + if max_time_seconds is None: self.max_time_seconds = float("inf") else: - self.max_time_seconds = max_time_seconds - + self.max_time_seconds = max_time_seconds + #functools requires none for infinite time, doesn't support inf if max_eval_time_seconds is not None and math.isinf(max_eval_time_seconds ): self.max_eval_time_seconds 
= None @@ -111,7 +115,7 @@ def __init__( self, self.survival_selector=survival_selector self.parent_selector=parent_selector - + total_var_p = crossover_probability + mutate_probability + mutate_then_crossover_probability + crossover_then_mutate_probability self.crossover_probability = crossover_probability / total_var_p self.mutate_probability = mutate_probability / total_var_p @@ -145,7 +149,7 @@ def __init__( self, self.budget = self.budget_list[self.generation] else: self.budget = None - + self.early_stop_tol = early_stop_tol self.early_stop_seconds = early_stop_seconds @@ -172,7 +176,7 @@ def __init__( self, if self.population is None: self.population = tpot2.Population(column_names=init_names) initial_population = [next(self.individual_generator) for _ in range(self.initial_population_size)] - self.population.add_to_population(initial_population) + self.population.add_to_population(initial_population, rng_=self.rng) def optimize(self): @@ -194,7 +198,7 @@ def optimize(self): processes=False, memory_limit=self.memory_limit) self._client = Client(self._cluster) - + self.max_queue_size = len(self._client.cluster.workers) @@ -209,11 +213,11 @@ def optimize(self): submitted_futures = {} submitted_inds = set() - start_time = time.time() - - try: - - + start_time = time.time() + + try: + + if self.verbose >= 1: if self.max_evaluated_individuals is not None: pbar = tqdm.tqdm(total=self.max_evaluated_individuals, miniters=1) @@ -228,7 +232,7 @@ def optimize(self): if len(submitted_futures) >= self.max_queue_size: break future = self._client.submit(tpot2.utils.eval_utils.eval_objective_list, individual, self.objective_functions, verbose=self.verbose, timeout=self.max_eval_time_seconds,**self.objective_kwargs) - + submitted_futures[future] = {"individual": individual, "time": time.time(), "budget": budget,} @@ -238,7 +242,7 @@ def optimize(self): done = False start_time = time.time() while not done: - + ############################### # Step 1: Check for finished futures ############################### @@ -253,8 +257,8 @@ def optimize(self): #Loop through all futures, collect completed and timeout futures. 
for completed_future in list(submitted_futures.keys()): - - #get scores and update + + #get scores and update if completed_future.done(): #if future is done #If the future is done but threw and error, record the error if completed_future.exception() or completed_future.status == "error": #if the future is done and threw an error @@ -277,18 +281,18 @@ def optimize(self): print("cancelld ", completed_future.cancelled()) scores = ["INVALID" for _ in range(len(self.objective_names))] else: #if future is not done - + #check if the future has been running for too long, cancel the future if time.time() - submitted_futures[completed_future]["time"] > self.max_eval_time_seconds*1.25: completed_future.cancel() - + if self.verbose >= 4: print(f'WARNING AN INDIVIDUAL TIMED OUT (Fallback): \n {submitted_futures[completed_future]} \n') - + scores = ["TIMEOUT" for _ in range(len(self.objective_names))] else: continue #otherwise, continue to next future - + #update population @@ -310,15 +314,15 @@ def optimize(self): #now we have a list of completed futures - + self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID") self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT") - - + + ############################### # Step 2: Early Stopping ############################### - if self.verbose >= 3: + if self.verbose >= 3: sign = np.sign(self.objective_function_weights) valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign cur_best_scores = valid_df.max(axis=0)*sign @@ -335,13 +339,13 @@ def optimize(self): cur_best_scores = valid_df.max(axis=0) cur_best_scores = cur_best_scores.to_numpy() #cur_best_scores = self.population.get_column(self.population.population, column_names=self.objective_names).max(axis=0)*sign #TODO this assumes the current population is the best - + improved = ( np.array(cur_best_scores) - np.array(best_scores) >= np.array(self.early_stop_tol) ) not_improved = np.logical_not(improved) generations_without_improvement = generations_without_improvement * not_improved + not_improved #set to zero if not improved, else increment - + timestamp_of_last_improvement = timestamp_of_last_improvement * not_improved + time.time()*improved #set to current time if improved - + pass #update best score best_scores = [max(best_scores[i], cur_best_scores[i]) for i in range(len(self.objective_names))] @@ -351,7 +355,7 @@ def optimize(self): if self.verbose >= 3: print(f"Early stop ({self.early_stop} individuals evaluated without improvement)") break - + if self.early_stop_seconds: if any(time.time() - timestamp_of_last_improvement > self.early_stop_seconds): if self.verbose >= 3: @@ -364,7 +368,7 @@ def optimize(self): print("Time limit reached") done = True break - + if self.max_evaluated_individuals is not None and len(self.population.evaluated_individuals.dropna(subset=self.objective_names)) >= self.max_evaluated_individuals: print("Evaluated enough individuals") done = True @@ -378,7 +382,7 @@ def optimize(self): for individual in individuals_to_evaluate: if self.max_queue_size > len(submitted_futures): future = self._client.submit(tpot2.utils.eval_utils.eval_objective_list, individual, self.objective_functions, verbose=self.verbose, timeout=self.max_eval_time_seconds,**self.objective_kwargs) - + submitted_futures[future] = {"individual": individual, "time": time.time(), "budget": budget,} 
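The queue handling above follows a common bounded-submission pattern with dask.distributed. The sketch below is illustrative only (slow_eval, the queue size, and the timeout are stand-ins rather than TPOT2 values): keep at most max_queue_size futures in flight, harvest finished ones, record errored futures as invalid, and cancel stragglers that exceed a wall-clock budget.

import time
from dask.distributed import Client, LocalCluster

def slow_eval(x):
    # Stand-in for an expensive pipeline evaluation.
    time.sleep(x)
    return x * x

client = Client(LocalCluster(n_workers=2, threads_per_worker=1, processes=False))
max_queue_size = 2          # at most this many evaluations in flight at once
max_eval_time = 5.0         # wall-clock budget per evaluation, in seconds
pending = {}                # Future -> submission timestamp
work = iter([0.1, 0.2, 0.3, 0.4])
results = []

while len(results) < 4:
    # Top up the queue while there is room and work left.
    while len(pending) < max_queue_size:
        x = next(work, None)
        if x is None:
            break
        pending[client.submit(slow_eval, x)] = time.time()

    # Harvest finished futures; cancel ones that ran past the fallback timeout.
    for fut in list(pending):
        if fut.done():
            results.append("INVALID" if fut.exception() else fut.result())
            del pending[fut]
        elif time.time() - pending[fut] > max_eval_time * 1.25:
            fut.cancel()
            results.append("TIMEOUT")
            del pending[fut]
    time.sleep(0.05)

client.close()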
@@ -400,13 +404,13 @@ def optimize(self): if len(cur_evaluated_population) > self.population_size: scores = evaluated[self.objective_names].to_numpy() weighted_scores = scores * self.objective_function_weights - new_population_index = np.ravel(self.survival_selector(weighted_scores, k=self.population_size)) #TODO make it clear that we are concatenating scores... - + new_population_index = np.ravel(self.survival_selector(weighted_scores, k=self.population_size, rng_=self.rng)) #TODO make it clear that we are concatenating scores... + #set new population try: cur_evaluated_population = np.array(cur_evaluated_population)[new_population_index] cur_evaluated_population = np.concatenate([cur_evaluated_population, unevaluated["Individual"].to_numpy()]) - self.population.set_population(cur_evaluated_population) + self.population.set_population(cur_evaluated_population, rng_=self.rng) except Exception as e: print("Exception in survival selection") print(e) @@ -419,7 +423,7 @@ def optimize(self): print("self.objective_function_weights", self.objective_function_weights) print("self.population_size", self.population_size) print("parents_df", parents_df) - + ############################### # Step 5: Parent Selection and Variation ############################### @@ -438,21 +442,21 @@ def optimize(self): if len(parents_df) < 2: var_ops = ["mutate" for _ in range(n_individuals_to_submit)] else: - var_ops = [np.random.choice(["crossover","mutate_then_crossover","crossover_then_mutate",'mutate'],p=[self.crossover_probability,self.mutate_then_crossover_probability, self.crossover_then_mutate_probability,self.mutate_probability]) for _ in range(n_individuals_to_submit)] + var_ops = [self.rng.choice(["crossover","mutate_then_crossover","crossover_then_mutate",'mutate'],p=[self.crossover_probability,self.mutate_then_crossover_probability, self.crossover_then_mutate_probability,self.mutate_probability]) for _ in range(n_individuals_to_submit)] parents = [] for op in var_ops: if op == "mutate": - parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=1, )]) + parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=1, rng_=self.rng)]) else: - parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=2, )]) + parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=2, rng_=self.rng)]) + + _offspring = self.population.create_offspring(parents, var_ops, rng_=self.rng, n_jobs=1, add_to_population=True) - _offspring = self.population.create_offspring(parents, var_ops, n_jobs=1, add_to_population=True) - # If we don't have enough evaluated individuals to use as parents for variation, we create new individuals randomly # This can happen if the individuals in the initial population are invalid if len(cur_evaluated_population) == 0 and len(submitted_futures) < self.max_queue_size: - + initial_population = self.population.evaluated_individuals.iloc[:self.initial_population_size*3] invalid_initial_population = initial_population[initial_population[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)] if len(invalid_initial_population) >= self.initial_population_size*3: #if all individuals in the 3*initial population are invalid @@ -462,7 +466,7 @@ def optimize(self): initial_population = [next(self.individual_generator) for _ in range(n_individuals_to_create)] self.population.add_to_population(initial_population) - + 
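The variation step above normalises the four operator probabilities and then draws an operator per offspring from the seeded Generator. A small sketch of that pattern, with illustrative probability values rather than TPOT2 defaults:

import numpy as np

rng = np.random.default_rng(1)
# crossover, mutate_then_crossover, crossover_then_mutate, mutate (illustrative weights)
p = np.array([0.2, 0.05, 0.05, 0.7])
p = p / p.sum()  # normalise so the weights form a valid probability distribution
var_ops = rng.choice(
    ["crossover", "mutate_then_crossover", "crossover_then_mutate", "mutate"],
    size=10,
    p=p,
)
print(var_ops)  # deterministic for a fixed seed, so restarted runs draw the same operators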
############################### @@ -473,7 +477,7 @@ def optimize(self): for individual in individuals_to_evaluate: if self.max_queue_size > len(submitted_futures): future = self._client.submit(tpot2.utils.eval_utils.eval_objective_list, individual, self.objective_functions, verbose=self.verbose, timeout=self.max_eval_time_seconds,**self.objective_kwargs) - + submitted_futures[future] = {"individual": individual, "time": time.time(), "budget": budget,} @@ -494,7 +498,7 @@ def optimize(self): ############################### # Step 7: Cleanup ############################### - + self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID") self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT") @@ -513,7 +517,7 @@ def optimize(self): tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"]) - + def get_unevaluated_individuals(self, column_names, budget=None, individual_list=None): if individual_list is not None: @@ -531,10 +535,8 @@ def get_unevaluated_individuals(self, column_names, budget=None, individual_list unevaluated_filter = lambda i: any(offspring_scores.loc[offspring_scores.index[i]][column_names].isna()) unevaluated_individuals_this_step = [i for i in range(len(cur_pop)) if unevaluated_filter(i)] return cur_pop[unevaluated_individuals_this_step] - + else: #if column names are not in the evaluated_individuals, then we have not evaluated any individuals yet for name_step in column_names: self.population.evaluated_individuals[name_step] = np.nan return cur_pop - - \ No newline at end of file diff --git a/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py b/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py index bf96a8dc..1956d49d 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py +++ b/tpot2/individual_representations/graph_pipeline_individual/graph_utils/graph_utils.py @@ -1,5 +1,5 @@ import networkx as nx -import random +import numpy as np def remove_and_stitch(graph, node): @@ -53,9 +53,11 @@ def invert_dictionary(d): for k, v in d.items(): inv_map.setdefault(v, set()).add(k) - return inv_map + return inv_map + +def select_nodes_same_depth(g1, node1, g2, node2, rng_=None): + rng = np.random.default_rng(rng_) -def select_nodes_same_depth(g1, node1, g2, node2): g1_nodes = nx.shortest_path_length(g1, source=node1) g2_nodes = nx.shortest_path_length(g2, source=node2) @@ -79,17 +81,19 @@ def select_nodes_same_depth(g1, node1, g2, node2): for n2 in g2_nodes[i]: possible_pairs.append( (n1,n2) ) - random.shuffle(possible_pairs) + rng.shuffle(possible_pairs) for p in possible_pairs: yield p[0], p[1] -def select_nodes_randomly(g1, g2,): +def select_nodes_randomly(g1, g2, rng_=None): + rng = np.random.default_rng(rng_) + sorted_self_nodes_list = list(g1.nodes) - random.shuffle(sorted_self_nodes_list) + rng.shuffle(sorted_self_nodes_list) sorted_other_nodes_list = list(g2.nodes) - random.shuffle(sorted_other_nodes_list) + rng.shuffle(sorted_other_nodes_list) for node1 in sorted_self_nodes_list: for node2 in sorted_other_nodes_list: yield node1, node2 \ No newline at end of file diff --git a/tpot2/individual_representations/graph_pipeline_individual/individual.py b/tpot2/individual_representations/graph_pipeline_individual/individual.py index 
a611755c..8ea3e0f2 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/individual.py +++ b/tpot2/individual_representations/graph_pipeline_individual/individual.py @@ -1,5 +1,4 @@ import numpy as np -import random from tpot2 import config import networkx as nx from abc import abstractmethod @@ -31,7 +30,7 @@ def __init__(self, *, self.label = label self._params = None - + from functools import partial #@https://stackoverflow.com/questions/20530455/isomorphic-comparison-of-networkx-graph-objects-instead-of-the-default-address @@ -73,22 +72,22 @@ def node_match(n1,n2, matched_labels): class GraphIndividual(BaseIndividual): ''' - An individual that contains a template for a graph sklearn pipeline. + An individual that contains a template for a graph sklearn pipeline. Parameters ---------- - root_config_dict : {dict with format {method class: param_function}} + root_config_dict : {dict with format {method class: param_function}} A dictionary of methods and functions that return a dictionary of hyperparameters. Used to create the root node of the graph. - inner_config_dict : {dict with format {method class: param_function}} - A dictionary of methods and functions that return a dictionary of hyperparameters. + inner_config_dict : {dict with format {method class: param_function}} + A dictionary of methods and functions that return a dictionary of hyperparameters. Used to create the inner nodes of the graph. If None, uses root_config_dict. - leaf_config_dict : {dict with format {method class: param_function}} + leaf_config_dict : {dict with format {method class: param_function}} A dictionary of methods and functions that return a dictionary of hyperparameters. - Used to create the leaf nodes of the graph. If not None, then all leafs must be created from this dictionary. + Used to create the leaf nodes of the graph. If not None, then all leafs must be created from this dictionary. Otherwise leaves will be created from inner_config_dict. initial_graph : (nx.DiGraph or list): - A graph to initialize the individual with. + A graph to initialize the individual with. If a list, it will initialize a linear graph with the methods in the list in the sequence provided. If the items in the list are dictionaries, nodes will be itialized with those dictionaries. Strings in the list correspond to the default configuration files. They can be 'Selector', 'Regressor', 'Transformer', 'Classifier'. 
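As the docstring above describes, initial_graph may be a list whose string entries name the default configuration dictionaries. A hedged usage sketch follows; the dictionary names come from imports visible in this patch, but the exact constructor call may differ between TPOT2 versions and the argument values are purely illustrative.

import numpy as np
from tpot2.config import classifier_config_dictionary, transformer_config_dictionary
from tpot2.individual_representations.graph_pipeline_individual import GraphIndividual

rng = np.random.default_rng(0)
ind = GraphIndividual(
    root_config_dict=classifier_config_dictionary,     # root node sampled from classifiers
    inner_config_dict=transformer_config_dictionary,   # inner nodes sampled from transformers
    initial_graph=["Classifier", "Transformer"],        # linear template built from the named defaults
    rng_=rng,                                           # seeds every random choice made during construction
)
pipeline = ind.export_pipeline()                        # materialise the template as an estimator graph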
@@ -108,25 +107,28 @@ class GraphIndividual(BaseIndividual): def __init__( self, root_config_dict, - inner_config_dict=None, + inner_config_dict=None, leaf_config_dict=None, initial_graph = None, - max_size = np.inf, + max_size = np.inf, linear_pipeline = False, name=None, crossover_same_depth = False, crossover_same_recursive_depth = True, - + hyperparameter_probability = 1, hyper_node_probability = 0, hyperparameter_alpha = 1, unique_subset_values = None, initial_subset_values = None, + rng_=None, ): self.__debug = False + rng = np.random.default_rng(rng_) + self.root_config_dict = root_config_dict self.inner_config_dict = inner_config_dict self.leaf_config_dict = leaf_config_dict @@ -146,35 +148,35 @@ def __init__( self.hyperparameter_alpha = hyperparameter_alpha if self.unique_subset_values is not None: - self.row_subset_selector = tpot2.representations.SubsetSelector(values=unique_subset_values, initial_set=initial_subset_values,k=20) + self.row_subset_selector = tpot2.representations.SubsetSelector(rng_=rng, values=unique_subset_values, initial_set=initial_subset_values,k=20) if isinstance(initial_graph, nx.DiGraph): self.graph = initial_graph self.root = list(nx.topological_sort(self.graph))[0] if self.leaf_config_dict is not None and len(self.graph.nodes) == 1: - first_leaf = create_node(self.leaf_config_dict) + first_leaf = create_node(self.leaf_config_dict, rng_=rng) self.graph.add_edge(self.root,first_leaf) elif isinstance(initial_graph, list): node_list = [] for item in initial_graph: if isinstance(item, dict): - node_list.append(create_node(item)) + node_list.append(create_node(item, rng_=rng)) elif isinstance(item, str): if item == 'Selector': from tpot2.config import selector_config_dictionary - node_list.append(create_node(selector_config_dictionary)) + node_list.append(create_node(selector_config_dictionary, rng_=rng)) elif item == 'Regressor': from tpot2.config import regressor_config_dictionary - node_list.append(create_node(regressor_config_dictionary)) + node_list.append(create_node(regressor_config_dictionary, rng_=rng)) elif item == 'Transformer': from tpot2.config import transformer_config_dictionary - node_list.append(create_node(transformer_config_dictionary)) - elif item == 'Classifier': + node_list.append(create_node(transformer_config_dictionary, rng_=rng)) + elif item == 'Classifier': from tpot2.config import classifier_config_dictionary - node_list.append(create_node(classifier_config_dictionary)) - + node_list.append(create_node(classifier_config_dictionary, rng_=rng)) + self.graph = nx.DiGraph() for child, parent in zip(node_list, node_list[1:]): self.graph.add_edge(parent, child) @@ -183,26 +185,26 @@ def __init__( else: self.graph = nx.DiGraph() - - self.root = create_node(self.root_config_dict) + + self.root = create_node(self.root_config_dict, rng_=rng) self.graph.add_node(self.root) if self.leaf_config_dict is not None: - first_leaf = create_node(self.leaf_config_dict) + first_leaf = create_node(self.leaf_config_dict, rng_=rng) self.graph.add_edge(self.root,first_leaf) - - self.initialize_all_nodes() + + self.initialize_all_nodes(rng_=rng) #self.root =list(nx.topological_sort(self.graph))[0] self.mutate_methods_list = [self._mutate_hyperparameters, - self._mutate_replace_node, + self._mutate_replace_node, self._mutate_remove_node, ] - + self.crossover_methods_list = [ self._crossover_swap_branch, ] @@ -240,17 +242,19 @@ def select_config_dict(self, node): return self.inner_config_dict - def initialize_all_nodes(self,): + def initialize_all_nodes(self, 
rng_=None): + rng = np.random.default_rng(rng_) for node in self.graph: if isinstance(node,GraphIndividual): continue if node.method_class is None: - node.method_class = random.choice(list(self.select_config_dict(node).keys())) + node.method_class = rng.choice(list(self.select_config_dict(node).keys())) if node.hyperparameters is None: get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) - - def fix_noncompliant_leafs(self): + + def fix_noncompliant_leafs(self, rng_=None): + rng = np.random.default_rng(rng_) leafs = [node for node in self.graph.nodes if len(list(self.graph.successors(node)))==0] compliant_leafs = [] noncompliant_leafs = [] @@ -259,11 +263,11 @@ def fix_noncompliant_leafs(self): compliant_leafs.append(leaf) else: noncompliant_leafs.append(leaf) - + #find all good leafs. If no good leaves exist, create a new one if len(compliant_leafs) == 0: first_leaf = NodeLabel(config_dict=self.leaf_config_dict) - first_leaf.method_class = random.choice(list(first_leaf.config_dict.keys())) #TODO: check when there is no new method + first_leaf.method_class = rng.choice(list(first_leaf.config_dict.keys())) #TODO: check when there is no new method first_leaf.hyperparameters = first_leaf.config_dict[first_leaf.method_class](config.hyperparametersuggestor) get_hyperparameter(self.select_config_dict(first_leaf)[first_leaf.method_class], nodelabel=first_leaf, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) compliant_leafs.append(first_leaf) @@ -271,12 +275,12 @@ def fix_noncompliant_leafs(self): #connect bad leaves to good leaves (making them internal nodes) if len(noncompliant_leafs) > 0: for node in noncompliant_leafs: - self.graph.add_edge(node, random.choice(compliant_leafs)) + self.graph.add_edge(node, rng.choice(compliant_leafs)) - def _merge_duplicated_nodes(self): + def _merge_duplicated_nodes(self): graph_changed = False merged = False @@ -292,7 +296,7 @@ def _merge_duplicated_nodes(self): node_children = set(self.graph.successors(node)) other_node_children = set(self.graph.successors(other_node)) #if nodes have identical children, they can be merged - if node_children == other_node_children: + if node_children == other_node_children: for other_node_parent in list(self.graph.predecessors(other_node)): if other_node_parent not in self.graph.predecessors(node): self.graph.add_edge(other_node_parent,node) @@ -318,7 +322,7 @@ def flatten_pipeline(self,depth=0): n1_p = flattened_full_graph.predecessors(node) remove_list.append(node) - + flattened_full_graph = nx.compose(flattened_full_graph, flattened) @@ -327,7 +331,7 @@ def flatten_pipeline(self,depth=0): flattened_full_graph.add_edges_from([ (n, n2) for n in n1_p for n2 in roots]) else: flattened_full_graph.nodes[node]['recursive depth'] = depth - + for node in remove_list: flattened_full_graph.remove_node(node) @@ -341,7 +345,7 @@ def flatten_pipeline(self,depth=0): flattened_full_graph.nodes[node]["subset_values"] = list(set(flattened_full_graph.nodes[node]["subset_values"]) & set(self.row_subset_selector.subsets)) return flattened_full_graph - + def get_num_nodes(self,): num_nodes = 0 @@ -355,7 +359,7 @@ def get_num_nodes(self,): def export_nested_pipeline(self, **graph_pipeline_args): - + flattened_full_graph = self.graph.copy() remove_list = [] for node in list(flattened_full_graph.nodes): @@ -366,7 +370,7 @@ def export_nested_pipeline(self, **graph_pipeline_args): 
n1_p = flattened_full_graph.predecessors(node) remove_list.append(node) - + flattened_full_graph.add_node(gp) @@ -376,14 +380,14 @@ def export_nested_pipeline(self, **graph_pipeline_args): for node in remove_list: flattened_full_graph.remove_node(node) - + estimator_graph = flattened_full_graph - + #mapping = {node:node.method_class(**node.hyperparameters) for node in estimator_graph} label_remapping = {} label_to_instance = {} - - for node in estimator_graph: + + for node in estimator_graph: found_unique_label = False i=1 while not found_unique_label: @@ -397,14 +401,14 @@ def export_nested_pipeline(self, **graph_pipeline_args): else: i+=1 - + if type(node) is tpot2.GraphPipeline: label_remapping[node] = label label_to_instance[label] = node else: label_remapping[node] = label label_to_instance[label] = node.method_class(**node.hyperparameters) - + estimator_graph = nx.relabel_nodes(estimator_graph, label_remapping) for label, instance in label_to_instance.items(): @@ -414,12 +418,12 @@ def export_nested_pipeline(self, **graph_pipeline_args): def export_pipeline(self, **graph_pipeline_args): estimator_graph = self.flatten_pipeline() - + #mapping = {node:node.method_class(**node.hyperparameters) for node in estimator_graph} label_remapping = {} label_to_instance = {} - - for node in estimator_graph: + + for node in estimator_graph: found_unique_label = False i=1 while not found_unique_label: @@ -431,7 +435,7 @@ def export_pipeline(self, **graph_pipeline_args): label_remapping[node] = label label_to_instance[label] = node.method_class(**node.hyperparameters) - + estimator_graph = nx.relabel_nodes(estimator_graph, label_remapping) for label, instance in label_to_instance.items(): @@ -460,7 +464,6 @@ def export_baikal(self,): if i == len(toposorted)-1: #last method doesn't need transformed. return baikal.Model(inputs=X, outputs=this_output, targets=y) - def plot(self): @@ -476,10 +479,10 @@ def plot(self): node_color = [plt.cm.Set1(G.nodes[n]['recursive depth']) for n in G] fig, ax = plt.subplots() - + nx.draw(G, pos, nodelist=nodelist, node_color=node_color, ax=ax, **options) - + '''edgelist = [] for n in n1.node_set: for child in n.children: @@ -514,15 +517,17 @@ def plot(self): ############# #TODO currently does not correctly return false when adding a leaf causes a duplicate node that is later merged - def mutate(self,): + def mutate(self, rng_=None): + rng = np.random.default_rng(rng_) self.key = None - graph = self.select_graphindividual() - return graph._mutate() + graph = self.select_graphindividual(rng_=rng) + return graph._mutate(rng_=rng) - def _mutate(self,): - random.shuffle(self.mutate_methods_list) + def _mutate(self, rng_=None): + rng = np.random.default_rng(rng_) + rng.shuffle(self.mutate_methods_list) for mutate_method in self.mutate_methods_list: - if mutate_method(): + if mutate_method(rng_=rng): self._merge_duplicated_nodes() if self.__debug: @@ -541,24 +546,26 @@ def _mutate(self,): try: nx.find_cycle(self.graph) print('something went wrong with ', mutate_method) - except: + except: pass - + return True - + return False - def _mutate_row_subsets(self,): + def _mutate_row_subsets(self, rng_=None): + rng = np.random.default_rng(rng_) if self.unique_subset_values is not None: - self.row_subset_selector.mutate() + self.row_subset_selector.mutate(rng_=rng) - def _mutate_hyperparameters(self): + def _mutate_hyperparameters(self, rng_=None): ''' Mutates the hyperparameters for a randomly chosen node in the graph. 
''' + rng = np.random.default_rng(rng_) sorted_nodes_list = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) + rng.shuffle(sorted_nodes_list) completed_one = False for node in sorted_nodes_list: if isinstance(node,GraphIndividual): @@ -567,55 +574,57 @@ def _mutate_hyperparameters(self): continue if not completed_one: - _,_, completed_one = get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) + _,_, completed_one = get_hyperparameter(self.select_config_dict(node)[node.method_class], rng_=rng, nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) else: - if self.hyper_node_probability > random.random(): - get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) + if self.hyper_node_probability > rng.random(): + get_hyperparameter(self.select_config_dict(node)[node.method_class], rng_=rng, nodelabel=node, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) return completed_one - - - def _mutate_replace_node(self): + + + def _mutate_replace_node(self, rng_=None): ''' Replaces the method in a randomly chosen node by a method from the available methods for that node. ''' + rng = np.random.default_rng(rng_) sorted_nodes_list = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) + rng.shuffle(sorted_nodes_list) for node in sorted_nodes_list: if isinstance(node,GraphIndividual): continue - node.method_class = random.choice(list(self.select_config_dict(node).keys())) + node.method_class = rng.choice(list(self.select_config_dict(node).keys())) if isinstance(self.select_config_dict(node)[node.method_class], dict): hyperparameters = self.select_config_dict(node)[node.method_class] node.hyperparameters = hyperparameters - else: + else: #hyperparameters = self.select_config_dict(node)[node.method_class](config.hyperparametersuggestor) #get_hyperparameter(self.select_config_dict(node)[node.method_class], nodelabel=None, alpha=self.hyperparameter_alpha, hyperparameter_probability=self.hyperparameter_probability) - new_node = create_node(self.select_config_dict(node)) + new_node = create_node(self.select_config_dict(node), rng_=rng) #TODO cleanup node.hyperparameters = new_node.hyperparameters node.method_class = new_node.method_class node.label = new_node.label return True - + return False - def _mutate_remove_node(self): + def _mutate_remove_node(self, rng_=None): ''' Removes a randomly chosen node and connects its parents to its children. If the node is the only leaf for an inner node and 'leaf_config_dict' is not none, we do not remove it. 
''' + rng = np.random.default_rng(rng_) nodes_list = list(self.graph.nodes) nodes_list.remove(self.root) leaves = graph_utils.get_leaves(self.graph) while len(nodes_list) > 0: - node = random.choices(nodes_list,)[0] + node = rng.choice(nodes_list) nodes_list.remove(node) if self.leaf_config_dict is not None and len(list(nx.descendants(self.graph,node))) == 0 : #if the node is a leaf @@ -635,55 +644,58 @@ def _mutate_remove_node(self): graph_utils.remove_and_stitch(self.graph, node) graph_utils.remove_nodes_disconnected_from_node(self.graph, self.root) return True - + return False - def _mutate_remove_edge(self): + def _mutate_remove_edge(self, rng_=None): ''' Deletes an edge as long as deleting that edge does not make the graph disconnected. ''' + rng = np.random.default_rng(rng_) sorted_nodes_list = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) + rng.shuffle(sorted_nodes_list) for child_node in sorted_nodes_list: parents = list(self.graph.predecessors(child_node)) if len(parents) > 1: # if it has more than one parent, you can remove an edge (if this is the only child of a node, it will become a leaf) for parent_node in parents: # if removing the egde will make the parent_node a leaf node, skip - if self.leaf_config_dict is not None and len(list(self.graph.successors(parent_node))) < 2: + if self.leaf_config_dict is not None and len(list(self.graph.successors(parent_node))) < 2: continue self.graph.remove_edge(parent_node, child_node) return True return False - def _mutate_add_edge(self): + def _mutate_add_edge(self, rng_=None): ''' Randomly add an edge from a node to another node that is not an ancestor of the first node. ''' + rng = np.random.default_rng(rng_) sorted_nodes_list = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) + rng.shuffle(sorted_nodes_list) for child_node in sorted_nodes_list: for parent_node in sorted_nodes_list: if self.leaf_config_dict is not None: if len(list(self.graph.successors(parent_node))) == 0: continue - + # skip if # - parent and child are the same node # - edge already exists # - child is an ancestor of parent - if (child_node is not parent_node) and not self.graph.has_edge(parent_node,child_node) and (child_node not in nx.ancestors(self.graph, parent_node)): + if (child_node is not parent_node) and not self.graph.has_edge(parent_node,child_node) and (child_node not in nx.ancestors(self.graph, parent_node)): self.graph.add_edge(parent_node,child_node) return True return False - def _mutate_insert_leaf(self): + def _mutate_insert_leaf(self, rng_=None): + rng = np.random.default_rng(rng_) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another + rng.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another for node in sorted_nodes_list: #if leafs are protected, check if node is a leaf #if node is a leaf, skip because we don't want to add node on top of node @@ -691,15 +703,14 @@ def _mutate_insert_leaf(self): and len(list(self.graph.successors(node))) == 0 #if node is leaf and len(list(self.graph.predecessors(node))) > 0 #except if node is root, in which case we want to add a leaf even if it happens to be a leaf too ): - - + continue - + #If node *is* the root or is not a leaf, add leaf node. 
(dont want to add leaf on top of leaf) if self.leaf_config_dict is not None: - new_node = create_node(self.leaf_config_dict) + new_node = create_node(self.leaf_config_dict, rng_=rng) else: - new_node = create_node(self.inner_config_dict) + new_node = create_node(self.inner_config_dict, rng_=rng) self.graph.add_node(new_node) self.graph.add_edge(node, new_node) @@ -707,13 +718,14 @@ def _mutate_insert_leaf(self): return False - def _mutate_insert_bypass_node(self): + def _mutate_insert_bypass_node(self, rng_=None): + rng = np.random.default_rng(rng_) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) sorted_nodes_list2 = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another - random.shuffle(sorted_nodes_list2) - for node in sorted_nodes_list: + rng.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another + rng.shuffle(sorted_nodes_list2) + for node in sorted_nodes_list: for child_node in sorted_nodes_list2: if child_node is not node and child_node not in nx.ancestors(self.graph, node): if self.leaf_config_dict is not None: @@ -721,7 +733,7 @@ def _mutate_insert_bypass_node(self): if len(list(nx.descendants(self.graph,node))) ==0 : continue - new_node = create_node(config_dict = self.inner_config_dict) + new_node = create_node(config_dict = self.inner_config_dict, rng_=rng) self.graph.add_node(new_node) self.graph.add_edges_from([(node, new_node), (new_node, child_node)]) @@ -730,23 +742,24 @@ def _mutate_insert_bypass_node(self): return False - def _mutate_insert_inner_node(self): + def _mutate_insert_inner_node(self, rng_=None): + rng = np.random.default_rng(rng_) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) sorted_nodes_list2 = list(self.graph.nodes) - random.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another - random.shuffle(sorted_nodes_list2) + rng.shuffle(sorted_nodes_list) #TODO: sort by number of children and/or parents? 
bias model one way or another + rng.shuffle(sorted_nodes_list2) for node in sorted_nodes_list: #loop through children of node for child_node in list(self.graph.successors(node)): - + if child_node is not node and child_node not in nx.ancestors(self.graph, node): if self.leaf_config_dict is not None: #If if we are protecting leafs, dont add connection into a leaf if len(list(nx.descendants(self.graph,node))) ==0 : continue - - new_node = create_node(config_dict = self.inner_config_dict) + + new_node = create_node(config_dict = self.inner_config_dict, rng_=rng) self.graph.add_node(new_node) self.graph.add_edges_from([(node, new_node), (new_node, child_node)]) @@ -769,7 +782,7 @@ def get_graphs(self): return graphs - + def _get_graphs(self, depth=1): graphs = [self] self.graph.graph['recursive depth'] = depth @@ -781,19 +794,30 @@ def _get_graphs(self, depth=1): return graphs - def select_graphindividual(self,): + def select_graphindividual(self, rng_=None): + rng = np.random.default_rng(rng_) graphs = self.get_graphs() weights = [g.graph.number_of_nodes() for g in graphs] - return random.choices(graphs, weights=weights)[0] + w_sum = sum(weights) + weights = [w / w_sum for w in weights] # generate probabilities based on sum of weights + return rng.choice(graphs, p=weights) + + + def select_graph_same_recursive_depth(self,ind1,ind2,rng_=None): + rng = np.random.default_rng(rng_) - def select_graph_same_recursive_depth(self,ind1,ind2): graphs1 = ind1.get_graphs() weights1 = [g.graph.number_of_nodes() for g in graphs1] + w1_sum = sum(weights1) + weights1 = [w / w1_sum for w in weights1] + graphs2 = ind2.get_graphs() weights2 = [g.graph.number_of_nodes() for g in graphs2] - - g1_sorted_graphs = random_weighted_sort(graphs1, weights1) - g2_sorted_graphs = random_weighted_sort(graphs2, weights2) + w2_sum = sum(weights2) + weights2 = [w / w2_sum for w in weights2] + + g1_sorted_graphs = random_weighted_sort(graphs1, weights1, rng) + g2_sorted_graphs = random_weighted_sort(graphs2, weights2, rng) for g1, g2 in zip(g1_sorted_graphs, g2_sorted_graphs): if g1.graph.graph['depth'] == g2.graph.graph['depth'] and g1.graph.graph['recursive depth'] == g2.graph.graph['recursive depth']: @@ -801,7 +825,7 @@ def select_graph_same_recursive_depth(self,ind1,ind2): return ind1,ind2 - def crossover(self, ind2): + def crossover(self, ind2, rng_=None): ''' self is the first individual, ind2 is the second individual If crossover_same_depth, it will select graphindividuals at the same recursive depth. @@ -809,25 +833,28 @@ def crossover(self, ind2): This does not impact graphs without subgraphs. And it does not impacts nodes that are not graphindividuals. 
Cros ''' - + + rng = np.random.default_rng(rng_) + self.key = None ind2.key = None if self.crossover_same_recursive_depth: # selects graphs from the same recursive depth and same depth from the root - g1, g2 = self.select_graph_same_recursive_depth(self, ind2) - - + g1, g2 = self.select_graph_same_recursive_depth(self, ind2, rng_=rng) + + else: - g1 = self.select_graphindividual() - g2 = ind2.select_graphindividual() - - return g1._crossover(g2) - - def _crossover(self, Graph): - - random.shuffle(self.crossover_methods_list) + g1 = self.select_graphindividual(rng_=rng) + g2 = ind2.select_graphindividual(rng_=rng) + + return g1._crossover(g2, rng_=rng) + + def _crossover(self, Graph, rng_=None): + rng = np.random.default_rng(rng_) + + rng.shuffle(self.crossover_methods_list) for crossover_method in self.crossover_methods_list: - if crossover_method(Graph): + if crossover_method(Graph, rng_=rng): self._merge_duplicated_nodes() return True @@ -835,35 +862,38 @@ def _crossover(self, Graph): try: nx.find_cycle(self.graph) print('something went wrong with ', crossover_method) - except: + except: pass return False - def _crossover_row_subsets(self, G2): + def _crossover_row_subsets(self, G2, rng_=None): + rng = np.random.default_rng(rng_) if self.unique_subset_values is not None and G2.unique_subset_values is not None: - self.row_subset_selector.crossover(G2.row_subset_selector) - + self.row_subset_selector.crossover(G2.row_subset_selector, rng_=rng) - def _crossover_swap_node(self, G2): + + def _crossover_swap_node(self, G2, rng_=None): ''' Swaps randomly chosen node from Parent1 with a randomly chosen node from Parent2. ''' + rng = np.random.default_rng(rng_) + if self.crossover_same_depth: - pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root) + pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root, rng_=rng) else: - pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph) + pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph, rng_=rng) for node1, node2 in pair_gen: if not (node1 is self.root or node2 is G2.root): #TODO: allow root - + n1_s = self.graph.successors(node1) n1_p = self.graph.predecessors(node1) n2_s = G2.graph.successors(node2) n2_p = G2.graph.predecessors(node2) - + self.graph.remove_node(node1) G2.graph.remove_node(node2) @@ -874,28 +904,30 @@ def _crossover_swap_node(self, G2): self.graph.add_edges_from([ (n, node2) for n in n1_p]) G2.graph.add_edges_from([ (n, node1) for n in n2_p]) - + return True return False - def _crossover_swap_branch(self, G2): + def _crossover_swap_branch(self, G2, rng_=None): ''' swaps a branch from parent1 with a branch from parent2. does not modify parent2 ''' + rng = np.random.default_rng(rng_) + if self.crossover_same_depth: - pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root) + pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root, rng_=rng) else: - pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph) + pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph, rng_=rng) for node1, node2 in pair_gen: #TODO: if root is in inner_config_dict, then do use it? 
if node1 is self.root or node2 is G2.root: #dont want to add root as inner node continue - - #check if node1 is a leaf and leafs are protected, don't add an input to the leave - if self.leaf_config_dict is not None: #if we are protecting leaves, + + #check if node1 is a leaf and leafs are protected, don't add an input to the leave + if self.leaf_config_dict is not None: #if we are protecting leaves, node1_is_leaf = len(list(self.graph.successors(node1))) == 0 node2_is_leaf = len(list(G2.graph.successors(node2))) == 0 #if not ((node1_is_leaf and node1_is_leaf) or (not node1_is_leaf and not node2_is_leaf)): #if node1 is a leaf @@ -929,14 +961,16 @@ def _crossover_swap_branch(self, G2): return False #TODO: Currently returns true even if hyperparameters are blank - def _crossover_hyperparameters(self, G2): + def _crossover_hyperparameters(self, G2, rng_=None): ''' Swaps the hyperparamters of one randomly chosen node in Parent1 with the hyperparameters of randnomly chosen node in Parent2. ''' + rng = np.random.default_rng(rng_) + if self.crossover_same_depth: - pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root) + pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root, rng_=rng) else: - pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph) + pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph, rng_=rng) for node1, node2 in pair_gen: if isinstance(node1,GraphIndividual) or isinstance(node2,GraphIndividual): @@ -952,15 +986,17 @@ def _crossover_hyperparameters(self, G2): #not including the nodes, just their children #Finds leaves attached to nodes and swaps them - def _crossover_swap_leaf_at_node(self, G2): + def _crossover_swap_leaf_at_node(self, G2, rng_=None): + rng = np.random.default_rng(rng_) + if self.crossover_same_depth: - pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root) + pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root, rng_=rng) else: - pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph) + pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph, rng_=rng) success = False for node1, node2 in pair_gen: - # if leaves are protected node1 and node2 must both be leaves or both be inner nodes + # if leaves are protected node1 and node2 must both be leaves or both be inner nodes if self.leaf_config_dict is not None and not (len(list(self.graph.successors(node1)))==0 ^ len(list(G2.graph.successors(node2)))==0): continue #self_leafs = [c for c in nx.descendants(self.graph,node1) if len(list(self.graph.successors(c)))==0 and c is not node1] @@ -975,7 +1011,7 @@ def _crossover_swap_leaf_at_node(self, G2): if len(node_leafs) >0: for c in node_leafs: - if random.choice([True,False]): + if rng.choice([True,False]): G2.graph.remove_node(c) self.graph.add_edge(node1, c) success = True @@ -983,22 +1019,24 @@ def _crossover_swap_leaf_at_node(self, G2): return success - def _crossover_take_branch(self, G2): + def _crossover_take_branch(self, G2, rng_=None): ''' Takes a subgraph from Parent2 and add it to a randomly chosen node in Parent1. 
''' + rng = np.random.default_rng(rng_) + if self.crossover_same_depth: - pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root) + pair_gen = graph_utils.select_nodes_same_depth(self.graph, self.root, G2.graph, G2.root, rng_=rng) else: - pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph) + pair_gen = graph_utils.select_nodes_randomly(self.graph, G2.graph, rng_=rng) for node1, node2 in pair_gen: #TODO: if root is in inner_config_dict, then do use it? if node2 is G2.root: #dont want to add root as inner node continue - - #check if node1 is a leaf and leafs are protected, don't add an input to the leave + + #check if node1 is a leaf and leafs are protected, don't add an input to the leave if self.leaf_config_dict is not None and len(list(self.graph.successors(node1))) == 0: continue @@ -1027,21 +1065,23 @@ def _crossover_take_branch(self, G2): return False #TODO: swap all leaf nodes - def _crossover_swap_all_leafs(self, G2): + def _crossover_swap_all_leafs(self, G2, rng_=None): pass #TODO: currently ignores ensembles, make it include nodes inside of ensembles - def optimize(self, objective_function, steps=5): - random.shuffle(self.optimize_methods_list) #select an optimization method + def optimize(self, rng_, objective_function, steps=5): + rng = np.random.default_rng(rng_) + rng.shuffle(self.optimize_methods_list) #select an optimization method for optimize_method in self.optimize_methods_list: - if optimize_method(objective_function, steps=steps): + if optimize_method(rng, objective_function, steps=steps): return True #optimize the hyperparameters of one method to improve the entire pipeline - def _optimize_optuna_single_method_full_pipeline(self, objective_function, steps=5): + def _optimize_optuna_single_method_full_pipeline(self, rng_, objective_function, steps=5): + rng = np.random.default_rng(rng_) nodes_list = list(self.graph.nodes) - random.shuffle(nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another + rng.shuffle(nodes_list) #TODO: sort by number of children and/or parents? bias model one way or another for node in nodes_list: if not isinstance(node, NodeLabel) or isinstance(self.select_config_dict(node)[node.method_class],dict): continue @@ -1051,7 +1091,7 @@ def _optimize_optuna_single_method_full_pipeline(self, objective_function, steps def objective(trial): params = self.select_config_dict(node)[node.method_class](trial) node.hyperparameters = params - + trial.set_user_attr('params', params) try: return objective_function(self) @@ -1064,7 +1104,7 @@ def objective(trial): #optimize the hyperparameters of all methods simultaneously to improve the entire pipeline - def _optimize_optuna_all_methods_full_pipeline(self, objective_function, steps=5): + def _optimize_optuna_all_methods_full_pipeline(self, rng_, objective_function, steps=5): nodes_list = list(self.graph.nodes) study = optuna.create_study() nodes_to_optimize = [] @@ -1080,9 +1120,9 @@ def objective(trial): params = self.select_config_dict(node)[node.method_class](trial, name=f'node_{i}') node.hyperparameters = params param_list.append(params) - + trial.set_user_attr('params', param_list) - + try: return objective_function(self) except: @@ -1095,8 +1135,8 @@ def objective(trial): node.hyperparameters = params return True - - + + def _cached_transform(cache_nunber=0): #use a cache for models at each CV fold? #cache just transformations at each fold? 
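The two Optuna-based optimisers above share one pattern: the node's config function proposes hyperparameters from a Trial, the full pipeline is scored, the proposal is remembered via set_user_attr, and failures are pushed to the worst possible value. A stripped-down, stand-alone sketch of that pattern (params_node and score_pipeline are illustrative stand-ins, not TPOT2 functions):

import optuna

def params_node(trial):
    # Stand-in for a config function such as params_LogisticRegression(trial).
    return {"alpha": trial.suggest_float("alpha", 1e-4, 1e-1, log=True)}

def score_pipeline(params):
    # Stand-in for evaluating the whole pipeline with the proposed hyperparameters.
    return (params["alpha"] - 0.01) ** 2

def objective(trial):
    params = params_node(trial)
    trial.set_user_attr("params", params)   # remember the proposal, as the evolver does
    try:
        return score_pipeline(params)
    except Exception:
        return float("inf")                 # failed evaluations get the worst score

study = optuna.create_study()               # Optuna minimises by default
study.optimize(objective, n_trials=5)
best_params = study.best_trial.user_attrs["params"]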
@@ -1114,8 +1154,8 @@ def unique_id(self) -> GraphKey: g.nodes[n]['label'] = {n.method_class: n.hyperparameters, "subset_values":g.nodes[n]["subset_values"]} else: g.nodes[n]['label'] = {n.method_class: n.hyperparameters} - - g.nodes[n]['method_class'] = n.method_class #TODO making this transformation doesn't feel very clean? + + g.nodes[n]['method_class'] = n.method_class #TODO making this transformation doesn't feel very clean? g.nodes[n]['hyperparameters'] = n.hyperparameters g = nx.convert_node_labels_to_integers(g) @@ -1130,20 +1170,21 @@ def full_node_list(self): node_list.pop(node_list.index(node)) node_list.extend(node.graph.nodes) return node_list - -def create_node(config_dict): + +def create_node(config_dict, rng_=None): ''' Takes a config_dict and returns a node with a random method_class and hyperparameters ''' - method_class = random.choice(list(config_dict.keys())) + rng = np.random.default_rng(rng_) + method_class = rng.choice(list(config_dict.keys())) #if method_class == GraphIndividual or method_class == 'Recursive': if method_class == 'Recursive': node = GraphIndividual(**config_dict[method_class]) else: - hyperparameters, params, _ = get_hyperparameter(config_dict[method_class], nodelabel=None) + hyperparameters, params, _ = get_hyperparameter(config_dict[method_class], rng_=rng, nodelabel=None) node = NodeLabel( method_class=method_class, @@ -1154,34 +1195,32 @@ def create_node(config_dict): return node - - -import random -def random_weighted_sort(l,weights): +def random_weighted_sort(l,weights, rng_=None): + rng = np.random.default_rng(rng_) sorted_l = [] indeces = {i: weights[i] for i in range(len(l))} while len(indeces) > 0: - next_item = random.choices(list(indeces.keys()), weights=list(indeces.values()))[0] + next_item = rng.choice(list(indeces.keys()), p=list(indeces.values())) indeces.pop(next_item) sorted_l.append(l[next_item]) - - return sorted_l + return sorted_l -def get_hyperparameter(config_func, nodelabel=None, alpha=1, hyperparameter_probability=1): +def get_hyperparameter(config_func, rng_, nodelabel=None, alpha=1, hyperparameter_probability=1): + rng = np.random.default_rng(rng_) changed = False if isinstance(config_func, dict): return config_func, None, changed if nodelabel is not None: - trial = config.hyperparametersuggestor.Trial(old_params=nodelabel._params, alpha=alpha, hyperparameter_probability=hyperparameter_probability) + trial = config.hyperparametersuggestor.Trial(rng_=rng, old_params=nodelabel._params, alpha=alpha, hyperparameter_probability=hyperparameter_probability) new_params = config_func(trial) changed = trial._params != nodelabel._params nodelabel._params = trial._params nodelabel.hyperparameters = new_params else: - trial = config.hyperparametersuggestor.Trial(old_params=None, alpha=alpha, hyperparameter_probability=hyperparameter_probability) + trial = config.hyperparametersuggestor.Trial(rng_=rng, old_params=None, alpha=alpha, hyperparameter_probability=hyperparameter_probability) new_params = config_func(trial) return new_params, trial._params, changed \ No newline at end of file diff --git a/tpot2/individual_representations/graph_pipeline_individual/templates.py b/tpot2/individual_representations/graph_pipeline_individual/templates.py index decc4570..c08d047a 100644 --- a/tpot2/individual_representations/graph_pipeline_individual/templates.py +++ b/tpot2/individual_representations/graph_pipeline_individual/templates.py @@ -3,7 +3,6 @@ import tpot2 import networkx as nx from 
tpot2.individual_representations.graph_pipeline_individual import GraphIndividual -import random from tpot2.individual_representations.graph_pipeline_individual.individual import create_node @@ -12,58 +11,59 @@ def estimator_graph_individual_generator( root_config_dict, inner_config_dict=None, leaf_config_dict=None, - max_size = np.inf, + max_size = np.inf, linear_pipeline = False, - hyperparameter_probability = 1, hyper_node_probability = 0, hyperparameter_alpha = 1, + rng_=None, **kwargs, ) : - + rng = np.random.default_rng(rng_) n_nodes = 0 while True: if n_nodes < max_size: n_nodes += 1 - + for k in root_config_dict.keys(): - + graph = nx.DiGraph() - root = create_node(config_dict={k:root_config_dict[k]}) + root = create_node(config_dict={k:root_config_dict[k]}, rng_=rng) graph.add_node(root) - - ind = GraphIndividual( inner_config_dict=inner_config_dict, - leaf_config_dict=leaf_config_dict, - root_config_dict=root_config_dict, - initial_graph = graph, - - max_size = max_size, - linear_pipeline = linear_pipeline, - hyperparameter_probability = hyperparameter_probability, - hyper_node_probability = hyper_node_probability, - hyperparameter_alpha = hyperparameter_alpha, - - **kwargs, - ) - + + ind = GraphIndividual( rng_=rng, + inner_config_dict=inner_config_dict, + leaf_config_dict=leaf_config_dict, + root_config_dict=root_config_dict, + initial_graph = graph, + + max_size = max_size, + linear_pipeline = linear_pipeline, + hyperparameter_probability = hyperparameter_probability, + hyper_node_probability = hyper_node_probability, + hyperparameter_alpha = hyperparameter_alpha, + + **kwargs, + ) + starting_ops = [] if inner_config_dict is not None: starting_ops.append(ind._mutate_insert_inner_node) if leaf_config_dict is not None: starting_ops.append(ind._mutate_insert_leaf) - + if len(starting_ops) > 0: if n_nodes > 0: - for _ in range(np.random.randint(0,min(n_nodes,3))): - func = np.random.choice(starting_ops) - func() + for _ in range(rng.integers(0,min(n_nodes,3))): + func = rng.choice(starting_ops) + func(rng_=rng) + - yield ind - + class BaggingCompositeGraphSklearn(): def __init__(self) -> None: @@ -72,4 +72,3 @@ def __init__(self) -> None: class BoostingCompositeGraphSklearn(): def __init__(self) -> None: pass - diff --git a/tpot2/individual_representations/individual.py b/tpot2/individual_representations/individual.py index 2dfdfc14..be61fdcb 100644 --- a/tpot2/individual_representations/individual.py +++ b/tpot2/individual_representations/individual.py @@ -1,7 +1,6 @@ from abc import abstractmethod import types import numpy as np -import random import copy import copy import typing @@ -11,32 +10,32 @@ class BaseIndividual: def __init__(self) -> None: - - - self.mutation_list = [] + self.mutation_list = [] self.crossover_list = [] - - def mutate(self,): + def mutate(self, rng_=None): + rng = np.random.default_rng(rng_) mutation_list_copy = self.mutation_list.copy() - random.shuffle(mutation_list_copy) + rng.shuffle(mutation_list_copy) for func in mutation_list_copy: if func(): return True return False - def crossover(self, ind2): + def crossover(self, ind2, rng_=None): + rng = np.random.default_rng(rng_) crossover_list_copy = self.crossover_list.copy() - random.shuffle(crossover_list_copy) + rng.shuffle(crossover_list_copy) for func in crossover_list_copy: if func(ind2): return True return False # a guided change of an individual when given an objective function - def optimize(self, objective_function, steps=5): + def optimize(self, objective_function, rng_=None , steps=5): + rng 
= np.random.default_rng(rng_) for _ in range(steps): - self.mutate() + self.mutate(rng_=rng) #Return a hashable unique to this individual setup #For use when evaluating whether or not an individual is 'the same' and another individual diff --git a/tpot2/individual_representations/subset_selector/subsetselector.py b/tpot2/individual_representations/subset_selector/subsetselector.py index f3beccf4..5dc1d8af 100644 --- a/tpot2/individual_representations/subset_selector/subsetselector.py +++ b/tpot2/individual_representations/subset_selector/subsetselector.py @@ -1,15 +1,18 @@ from numpy import iterable import tpot2 -import random +import numpy as np from .. import BaseIndividual class SubsetSelector(BaseIndividual): def __init__( self, values, + rng_=None, initial_set = None, k=1, #step size for shuffling ): + rng = np.random.default_rng(rng_) + if isinstance(values, int): self.values = set(range(0,values)) else: @@ -17,7 +20,7 @@ def __init__( self, if initial_set is None: - self.subsets = set(random.choices(values, k=k)) + self.subsets = set(rng.choice(list(values), size=k)) else: self.subsets = set(initial_set) @@ -25,20 +28,23 @@ def __init__( self, self.mutation_list = [self._mutate_add, self._mutate_remove] self.crossover_list = [self._crossover_swap] - - def _mutate_add(self,): + + def _mutate_add(self, rng_=None): + rng = np.random.default_rng(rng_) not_included = list(self.values.difference(self.subsets)) if len(not_included) > 1: - self.subsets.update(random.sample(not_included, k=min(self.k, len(not_included)))) + self.subsets.update(rng.choice(not_included, size=min(self.k, len(not_included)), replace=False)) return True else: return False - def _mutate_remove(self,): + def _mutate_remove(self, rng_=None): + rng = np.random.default_rng(rng_) if len(self.subsets) > 1: - self.subsets = self.subsets - set(random.sample(list(self.subsets), k=min(self.k, len(self.subsets)-1) )) + self.subsets = self.subsets - set(rng.choice(list(self.subsets), size=min(self.k, len(self.subsets)-1), replace=False)) - def _crossover_swap(self, ss2): + def _crossover_swap(self, ss2, rng_=None): + rng = np.random.default_rng(rng_) diffs = self.subsets.symmetric_difference(ss2.subsets) if len(diffs) == 0: @@ -46,6 +52,6 @@ def _crossover_swap(self, ss2): for v in diffs: self.subsets.discard(v) ss2.subsets.discard(v) - random.choice([self.subsets, ss2.subsets]).add(v) - + rng.choice([self.subsets, ss2.subsets]).add(v) + return True diff --git a/tpot2/population.py b/tpot2/population.py index f32ad5c4..e8bf96dc 100644 --- a/tpot2/population.py +++ b/tpot2/population.py @@ -12,43 +12,47 @@ import pickle import dask -def mutate(individual): +def mutate(individual, rng_=None): + rng = np.random.default_rng(rng_) if isinstance(individual, collections.abc.Iterable): for ind in individual: - ind.mutate() + ind.mutate(rng_=rng) else: - individual.mutate() + individual.mutate(rng_=rng) return individual -def crossover(parents): - parents[0].crossover(parents[1]) +def crossover(parents, rng_=None): + rng = np.random.default_rng(rng_) + parents[0].crossover(parents[1], rng_=rng) return parents[0] -def mutate_and_crossover(parents): - parents[0].crossover(parents[1]) - parents[0].mutate() - parents[1].mutate() +def mutate_and_crossover(parents, rng_=None): + rng = np.random.default_rng(rng_) + parents[0].crossover(parents[1], rng_=rng) + parents[0].mutate(rng_=rng) + parents[1].mutate(rng_=rng) return parents -def crossover_and_mutate(parents): +def crossover_and_mutate(parents, rng_=None): + rng = np.random.default_rng(rng_) for p in parents: - p.mutate() -
parents[0].crossover(parents[1]) + p.mutate(rng_=rng) + parents[0].crossover(parents[1], rng_=rng) return parents[0] -built_in_var_ops_dict = {"mutate":mutate, - "crossover":crossover, - "mutate_then_crossover":mutate_and_crossover, +built_in_var_ops_dict = {"mutate":mutate, + "crossover":crossover, + "mutate_then_crossover":mutate_and_crossover, "crossover_then_mutate":crossover_and_mutate} - + class Population(): ''' Primary usage is to keep track of evaluated individuals - + Parameters ---------- initial_population : {list of BaseIndividuals}, default=None @@ -59,7 +63,7 @@ class Population(): callback : {function}, default=None NOT YET IMPLEMENTED A function to call after each generation. The function should take a Population object as its only argument. - + Attributes ---------- population : {list of BaseIndividuals} @@ -75,7 +79,7 @@ def __init__( self, ) -> None: if column_names is not None: - + column_names = column_names+["Parents", "Variation_Function"] else: column_names = ["Parents", "Variation_Function"] @@ -86,21 +90,22 @@ def __init__( self, self.callback=callback self.population = [] - def survival_select(self, selector, weights, columns_names, n_survivors, inplace=True): + def survival_select(self, selector, weights, columns_names, n_survivors, rng_=None, inplace=True): + rng = np.random.default_rng(rng_) weighted_scores = self.get_column(self.population, column_names=columns_names) * weights - new_population_index = np.ravel(selector(weighted_scores, k=n_survivors)) #TODO make it clear that we are concatenating scores... + new_population_index = np.ravel(selector(weighted_scores, k=n_survivors, rng_=rng)) #TODO make it clear that we are concatenating scores... new_population = np.array(self.population)[new_population_index] if inplace: - self.set_population(new_population) + self.set_population(new_population, rng_=rng) return new_population - def parent_select(self, selector, weights, columns_names, k, n_parents): - + def parent_select(self, selector, weights, columns_names, k, n_parents, rng_=None): + rng = np.random.default_rng(rng_) weighted_scores = self.get_column(self.population, column_names=columns_names) * weights - parents_index = selector(weighted_scores, k=k, n_parents=n_parents) + parents_index = selector(weighted_scores, k=k, n_parents=n_parents, rng_=rng) parents = np.array(self.population)[parents_index] return parents - + #remove individuals that either do not have a column_name value or a nan in that value #TODO take into account when the value is not a list/tuple? @@ -108,12 +113,12 @@ def parent_select(self, selector, weights, columns_names, k, n_parents): def remove_invalid_from_population(self, column_names, invalid_value = "INVALID"): ''' Remove individuals from the live population if either do not have a value in the column_name column or if the value contains np.nan. - + Parameters ---------- column_name : {str} The name of the column to check for np.nan values. - + Returns ------- None @@ -124,17 +129,17 @@ def remove_invalid_from_population(self, column_names, invalid_value = "INVALID" is_valid = lambda ind: ind.unique_id() not in self.evaluated_individuals.index or invalid_value not in self.evaluated_individuals.loc[ind.unique_id(),column_names].to_list() self.population = [ind for ind in self.population if is_valid(ind)] - - # takes the list of individuals and adds it to the live population list. + + # takes the list of individuals and adds it to the live population list. 
# if keep_repeats is False, repeated individuals are not added to the population - # returns a list of individuals added to the live population + # returns a list of individuals added to the live population #TODO make keep repeats allow for previously evaluated individuals, #but make sure that the live population only includes one of each, no repeats - def add_to_population(self, individuals: typing.List[BaseIndividual], keep_repeats=False, mutate_until_unique=True): + def add_to_population(self, individuals: typing.List[BaseIndividual], rng_=None, keep_repeats=False, mutate_until_unique=True): ''' Add individuals to the live population. Add individuals to the evaluated_individuals if they are not already there. - + Parameters: ----------- individuals : {list of BaseIndividuals} @@ -143,6 +148,9 @@ def add_to_population(self, individuals: typing.List[BaseIndividual], keep_repea If True, allow the population to have repeated individuals. If False, only add individuals that have not yet been added to geneology. ''' + + rng = np.random.default_rng(rng_) + if not isinstance(individuals, collections.abc.Iterable): individuals = [individuals] @@ -164,7 +172,7 @@ def add_to_population(self, individuals: typing.List[BaseIndividual], keep_repea elif mutate_until_unique: #If its old and we don't want repeats, we can optionally mutate it until it is unique for _ in range(20): individual = copy.deepcopy(individual) - individual.mutate() + individual.mutate(rng_=rng) key = individual.unique_id() if key not in self.evaluated_individuals.index: self.evaluated_individuals.loc[key] = np.nan @@ -172,7 +180,7 @@ def add_to_population(self, individuals: typing.List[BaseIndividual], keep_repea self.population.append(individual) new_individuals.append(individual) break - + return new_individuals @@ -195,7 +203,7 @@ def update_column(self, individual, column_names, data): self.evaluated_individuals.loc[key,column_names] = data - + def get_column(self, individual, column_names=None, to_numpy=True): ''' Update the column_name column in the evaluated_individuals with the data. @@ -229,13 +237,13 @@ def get_column(self, individual, column_names=None, to_numpy=True): def get_unevaluated_individuals(self, column_names, individual_list=None): if individual_list is None: individual_list = self.population - + if self.use_unique_id: unevaluated_filter = lambda individual: individual.unique_id() not in self.evaluated_individuals.index or any(self.evaluated_individuals.loc[individual.unique_id(), column_names].isna()) else: unevaluated_filter = lambda individual: individual not in self.evaluated_individuals.index or any(self.evaluated_individuals.loc[individual.unique_id(), column_names].isna()) - - return [individual for individual in individual_list if unevaluated_filter(individual)] + + return [individual for individual in individual_list if unevaluated_filter(individual)] # def get_valid_evaluated_individuals_df(self, column_names_to_check, invalid_values=["TIMEOUT","INVALID"]): # ''' @@ -244,18 +252,19 @@ def get_unevaluated_individuals(self, column_names, individual_list=None): # return self.evaluated_individuals[~self.evaluated_individuals[column_names_to_check].isin(invalid_values).any(axis=1)] #the live population empied and is set to new_population - def set_population(self, new_population, keep_repeats=True): + def set_population(self, new_population, rng_=None, keep_repeats=True): ''' sets population to new population for selection? 
''' + rng = np.random.default_rng(rng_) self.population = [] - self.add_to_population(new_population, keep_repeats=keep_repeats) + self.add_to_population(new_population, rng_=rng, keep_repeats=keep_repeats) - #TODO should we just generate one offspring per crossover? - def create_offspring(self, parents_list, var_op_list, add_to_population=True, keep_repeats=False, mutate_until_unique=True, n_jobs=1): + #TODO should we just generate one offspring per crossover? + def create_offspring(self, parents_list, var_op_list, rng_=None, add_to_population=True, keep_repeats=False, mutate_until_unique=True, n_jobs=1): ''' - parents_list: a list of lists of parents. + parents_list: a list of lists of parents. var_op_list: a list of var_ops to apply to each list of parents. Should be the same length as parents_list. for example: @@ -265,53 +274,55 @@ def create_offspring(self, parents_list, var_op_list, add_to_population=True, ke This will apply crossover to parent1 and parent2 and mutate to parent3. Creates offspring from parents using the var_op_list. - If string, will use a built in method + If string, will use a built in method - "crossover" : crossover - "mutate" : mutate - "mutate_and_crossover" : mutate_and_crossover - "cross_and_mutate" : cross_and_mutate ''' + rng = np.random.default_rng(rng_) new_offspring = [] - all_offspring = parallel_create_offspring(parents_list, var_op_list, n_jobs=n_jobs) + all_offspring = parallel_create_offspring(parents_list, var_op_list, rng_=rng, n_jobs=n_jobs) for parents, offspring, var_op in zip(parents_list, all_offspring, var_op_list): - + # if var_op in built_in_var_ops_dict: # var_op = built_in_var_ops_dict[var_op] # offspring = copy.deepcopy(parents) # offspring = var_op(offspring) # if isinstance(offspring, collections.abc.Iterable): - # offspring = offspring[0] + # offspring = offspring[0] if add_to_population: - added = self.add_to_population(offspring, keep_repeats=keep_repeats, mutate_until_unique=mutate_until_unique) + added = self.add_to_population(offspring, rng_=rng, keep_repeats=keep_repeats, mutate_until_unique=mutate_until_unique) if len(added) > 0: for new_child in added: parent_keys = [parent.unique_id() for parent in parents] if not pd.api.types.is_object_dtype(self.evaluated_individuals["Parents"]): #TODO Is there a cleaner way of doing this? Not required for some python environments? self.evaluated_individuals["Parents"] = self.evaluated_individuals["Parents"].astype('object') self.evaluated_individuals.at[new_child.unique_id(),"Parents"] = tuple(parent_keys) - + #if var_op is a function if hasattr(var_op, '__call__'): self.evaluated_individuals.at[new_child.unique_id(),"Variation_Function"] = var_op.__name__ else: self.evaluated_individuals.at[new_child.unique_id(),"Variation_Function"] = var_op - - + + new_offspring.append(new_child) else: new_offspring.append(offspring) - - + + return new_offspring - #TODO should we just generate one offspring per crossover? - def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutation_function_weights, crossover_functions,crossover_function_weights, add_to_population=True, keep_repeats=False, mutate_until_unique=True): + #TODO should we just generate one offspring per crossover? 
+ def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutation_function_weights, crossover_functions,crossover_function_weights, rng_=None, add_to_population=True, keep_repeats=False, mutate_until_unique=True): + rng = np.random.default_rng(rng_) new_offspring = [] all_offspring = [] @@ -320,62 +331,62 @@ def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutati for parents, var_op in zip(parents_list,var_op_list): #TODO put this loop in population class if var_op == "mutation": - mutation_op = np.random.choice(mutation_functions, p=mutation_function_weights) - all_offspring.append(copy_and_mutate(parents, mutation_op)) + mutation_op = rng.choice(mutation_functions, p=mutation_function_weights) + all_offspring.append(copy_and_mutate(parents, mutation_op, rng_=rng)) chosen_ops.append(mutation_op.__name__) - - + + elif var_op == "crossover": - crossover_op = np.random.choice(crossover_functions, p=crossover_function_weights) - all_offspring.append(copy_and_crossover(parents, crossover_op)) + crossover_op = rng.choice(crossover_functions, p=crossover_function_weights) + all_offspring.append(copy_and_crossover(parents, crossover_op, rng_=rng)) chosen_ops.append(crossover_op.__name__) elif var_op == "mutate_then_crossover": - mutation_op1 = np.random.choice(mutation_functions, p=mutation_function_weights) - mutation_op2 = np.random.choice(mutation_functions, p=mutation_function_weights) - crossover_op = np.random.choice(crossover_functions, p=crossover_function_weights) - p1 = copy_and_mutate(parents[0], mutation_op1) - p2 = copy_and_mutate(parents[1], mutation_op2) - crossover_op(p1,p2) + mutation_op1 = rng.choice(mutation_functions, p=mutation_function_weights) + mutation_op2 = rng.choice(mutation_functions, p=mutation_function_weights) + crossover_op = rng.choice(crossover_functions, p=crossover_function_weights) + p1 = copy_and_mutate(parents[0], mutation_op1, rng_=rng) + p2 = copy_and_mutate(parents[1], mutation_op2, rng_=rng) + crossover_op(p1,p2,rng_=rng) all_offspring.append(p1) chosen_ops.append(f"{mutation_op1.__name__} , {mutation_op2.__name__} , {crossover_op.__name__}") elif var_op == "crossover_then_mutate": - crossover_op = np.random.choice(crossover_functions, p=crossover_function_weights) - child = copy_and_crossover(parents, crossover_op) - mutation_op = np.random.choice(mutation_functions, p=mutation_function_weights) - mutation_op(child) + crossover_op = rng.choice(crossover_functions, p=crossover_function_weights) + child = copy_and_crossover(parents, crossover_op, rng_=rng) + mutation_op = rng.choice(mutation_functions, p=mutation_function_weights) + mutation_op(child, rng_=rng) all_offspring.append(child) chosen_ops.append(f"{crossover_op.__name__} , {mutation_op.__name__}") for parents, offspring, var_op in zip(parents_list, all_offspring, chosen_ops): - + # if var_op in built_in_var_ops_dict: # var_op = built_in_var_ops_dict[var_op] # offspring = copy.deepcopy(parents) # offspring = var_op(offspring) # if isinstance(offspring, collections.abc.Iterable): - # offspring = offspring[0] + # offspring = offspring[0] if add_to_population: - added = self.add_to_population(offspring, keep_repeats=keep_repeats, mutate_until_unique=mutate_until_unique) + added = self.add_to_population(offspring, rng_=rng, keep_repeats=keep_repeats, mutate_until_unique=mutate_until_unique) if len(added) > 0: for new_child in added: parent_keys = [parent.unique_id() for parent in parents] if not 
pd.api.types.is_object_dtype(self.evaluated_individuals["Parents"]): #TODO Is there a cleaner way of doing this? Not required for some python environments? self.evaluated_individuals["Parents"] = self.evaluated_individuals["Parents"].astype('object') self.evaluated_individuals.at[new_child.unique_id(),"Parents"] = tuple(parent_keys) - + self.evaluated_individuals.at[new_child.unique_id(),"Variation_Function"] = var_op - - + + new_offspring.append(new_child) else: new_offspring.append(offspring) - - + + return new_offspring @@ -384,54 +395,58 @@ def create_offspring2(self, parents_list, var_op_list, mutation_functions,mutati def get_id(individual): return individual.unique_id() -def parallel_create_offspring(parents_list, var_op_list, n_jobs=1): +def parallel_create_offspring(parents_list, var_op_list, rng_=None, n_jobs=1): + rng = np.random.default_rng(rng_) if n_jobs == 1: - return nonparallel_create_offpring(parents_list, var_op_list) + return nonparallel_create_offpring(parents_list, var_op_list, rng_=rng) else: delayed_offspring = [] for parents, var_op in zip(parents_list,var_op_list): #TODO put this loop in population class if var_op in built_in_var_ops_dict: var_op = built_in_var_ops_dict[var_op] - delayed_offspring.append(dask.delayed(copy_and_change)(parents, var_op)) + delayed_offspring.append(dask.delayed(copy_and_change)(parents, var_op, rng_=rng)) offspring = dask.compute(*delayed_offspring, num_workers=n_jobs, threads_per_worker=1) return offspring -def nonparallel_create_offpring(parents_list, var_op_list, n_jobs=1): +def nonparallel_create_offpring(parents_list, var_op_list, rng_=None, n_jobs=1): + rng = np.random.default_rng(rng_) offspring = [] for parents, var_op in zip(parents_list,var_op_list): #TODO put this loop in population class if var_op in built_in_var_ops_dict: var_op = built_in_var_ops_dict[var_op] - offspring.append(copy_and_change(parents, var_op)) + offspring.append(copy_and_change(parents, var_op, rng_=rng)) return offspring -def copy_and_change(parents, var_op): +def copy_and_change(parents, var_op, rng_=None): + rng = np.random.default_rng(rng_) offspring = copy.deepcopy(parents) - offspring = var_op(offspring) + offspring = var_op(offspring, rng_=rng) if isinstance(offspring, collections.abc.Iterable): offspring = offspring[0] return offspring -def copy_and_mutate(parents, var_op): +def copy_and_mutate(parents, var_op, rng_=None): + rng = np.random.default_rng(rng_) offspring = copy.deepcopy(parents) - var_op(offspring) + var_op(offspring, rng_=rng) if isinstance(offspring, collections.abc.Iterable): offspring = offspring[0] return offspring -def copy_and_crossover(parents, var_op): +def copy_and_crossover(parents, var_op, rng_=None): + rng = np.random.default_rng(rng_) offspring = copy.deepcopy(parents) - var_op(offspring[0],offspring[1]) + var_op(offspring[0],offspring[1], rng_=rng) return offspring[0] def parallel_get_id(n_jobs, individual_list): id_list = Parallel(n_jobs=n_jobs)(delayed(get_id)(ind) for ind in individual_list) return id_list - diff --git a/tpot2/selectors/lexicase_selection.py b/tpot2/selectors/lexicase_selection.py index 54683a44..0afe1f34 100644 --- a/tpot2/selectors/lexicase_selection.py +++ b/tpot2/selectors/lexicase_selection.py @@ -1,24 +1,24 @@ import numpy as np -import random -def lexicase_selection(scores, k, n_parents=1,): - """Select the best individual according to Lexicase Selection, *k* times. 
+def lexicase_selection(scores, k, rng_=None, n_parents=1,): + """Select the best individual according to Lexicase Selection, *k* times. The returned list contains the indices of the chosen *individuals*. :param scores: The score matrix, where rows the individulas and the columns are the corresponds to scores on different objectives. :returns: A list of indices of selected individuals. This function uses the :func:`~random.choice` function from the python base :mod:`random` module. """ + rng = np.random.default_rng(rng_) chosen =[] for i in range(k*n_parents): candidates = list(range(len(scores))) cases = list(range(len(scores[0]))) - random.shuffle(cases) - + rng.shuffle(cases) + while len(cases) > 0 and len(candidates) > 1: best_val_for_case = max(scores[candidates,cases[0]]) candidates = [x for x in candidates if scores[x, cases[0]] == best_val_for_case] cases.pop(0) - chosen.append(random.choice(candidates)) + chosen.append(rng.choice(candidates)) return np.reshape(chosen, (k, n_parents)) \ No newline at end of file diff --git a/tpot2/selectors/max_weighted_average_selector.py b/tpot2/selectors/max_weighted_average_selector.py index edf3b06e..d142bafd 100644 --- a/tpot2/selectors/max_weighted_average_selector.py +++ b/tpot2/selectors/max_weighted_average_selector.py @@ -1,7 +1,6 @@ import numpy as np -import random -def max_weighted_average_selector(scores,k, n_parents=1,): +def max_weighted_average_selector(scores,k, rng_=None, n_parents=1,): ave_scores = [np.nanmean(s ) for s in scores ] #TODO make this more efficient chosen = np.argsort(ave_scores)[::-1][0:k] #TODO check this behavior with nans return np.reshape(chosen, (k, n_parents)) \ No newline at end of file diff --git a/tpot2/selectors/nsgaii.py b/tpot2/selectors/nsgaii.py index 0090407b..bb7bf76d 100644 --- a/tpot2/selectors/nsgaii.py +++ b/tpot2/selectors/nsgaii.py @@ -1,12 +1,11 @@ import numpy as np -import random # Deb, Pratab, Agarwal, and Meyarivan, “A fast elitist non-dominated sorting genetic algorithm for multi-objective optimization: NSGA-II”, 2002. 
# chatgpt def nondominated_sorting(matrix): """ - Returns the indexes of the matrix + Returns the indexes of the matrix bigger is better """ # Initialize the front list and the rank list @@ -20,7 +19,7 @@ def nondominated_sorting(matrix): # Initialize the list of points that dominate the current point dominating = [0 for _ in range(len(matrix))] #ni the number of solutions that denominate solution i - + # Iterate over all points for p, p_scores in enumerate(matrix): # Iterate over all other points @@ -31,7 +30,7 @@ def nondominated_sorting(matrix): # If the current point is dominated by the other point, add it to the list of dominated points elif dominates(q_scores, p_scores): dominating[p] += 1 - + if dominating[p] == 0: fronts[0].add(p) @@ -65,16 +64,16 @@ def crowding_distance(matrix): matrix = np.array(matrix) # Initialize the crowding distance for each point to zero crowding_distances = [0 for _ in range(len(matrix))] - + # Iterate over each objective for objective_i in range(matrix.shape[1]): # Sort the points according to the current objective sorted_i = matrix[:, objective_i].argsort() - + # Set the crowding distance of the first and last points to infinity crowding_distances[sorted_i[0]] = float("inf") crowding_distances[sorted_i[-1]] = float("inf") - + if matrix[sorted_i[0]][objective_i] == matrix[sorted_i[-1]][objective_i]: # https://github.com/DEAP/deap/blob/f2a570567fa3dce156d7cfb0c50bc72f133258a1/deap/tools/emo.py#L135 continue @@ -88,7 +87,7 @@ def crowding_distance(matrix): -def survival_select_NSGA2(scores, k,): +def survival_select_NSGA2(scores, k, rng_=None): pareto_fronts = nondominated_sorting(scores) @@ -109,5 +108,5 @@ def survival_select_NSGA2(scores, k,): chosen.extend(sorted_indeces[0:(k-len(chosen))]) current_front_number += 1 - + return chosen \ No newline at end of file diff --git a/tpot2/selectors/random_selector.py b/tpot2/selectors/random_selector.py index 2a384c62..54b37978 100644 --- a/tpot2/selectors/random_selector.py +++ b/tpot2/selectors/random_selector.py @@ -1,6 +1,6 @@ import numpy as np -import random -def random_selector(scores, k, n_parents=1,): - chosen = random.choices(list(range(0,len(scores))), k=k*n_parents) +def random_selector(scores, k, rng_=None, n_parents=1, ): + rng = np.random.default_rng(rng_) + chosen = rng.choice(list(range(0,len(scores))), size=k*n_parents) return np.reshape(chosen, (k, n_parents)) \ No newline at end of file diff --git a/tpot2/selectors/tournament_selection.py b/tpot2/selectors/tournament_selection.py index a2bbf950..a715a9dd 100644 --- a/tpot2/selectors/tournament_selection.py +++ b/tpot2/selectors/tournament_selection.py @@ -1,7 +1,6 @@ import numpy as np -import random -def tournament_selection(scores, k, n_parents=1, tournament_size=2, score_index=0): +def tournament_selection(scores, k, rng_=None, n_parents=1, tournament_size=2, score_index=0): """Select the best individual among *tournsize* randomly chosen individuals, *k* times. The returned list contains the indices of the chosen *individuals*. :param scores: The score matrix, where rows the individulas and the columns are the corresponds to scores on different objectives. @@ -13,6 +12,8 @@ def tournament_selection(scores, k, n_parents=1, tournament_size=2, score_index= :mod:`random` module. 
""" + rng = np.random.default_rng(rng_) + if isinstance(score_index,int): key=lambda x:x[1][score_index] elif score_index == "average": @@ -20,8 +21,8 @@ def tournament_selection(scores, k, n_parents=1, tournament_size=2, score_index= chosen = [] for i in range(k*n_parents): - aspirants_idx =[random.randrange(len(scores)) for i in range(tournament_size)] + aspirants_idx =[rng.choice(len(scores)) for i in range(tournament_size)] aspirants = list(zip(aspirants_idx, scores[aspirants_idx])) # Zip indices and elements together chosen.append(max(aspirants, key=key)[0]) # Retrun the index of the maximum element - + return np.reshape(chosen, (k, n_parents)) \ No newline at end of file diff --git a/tpot2/selectors/tournament_selection_dominated.py b/tpot2/selectors/tournament_selection_dominated.py index be485dc5..74556894 100644 --- a/tpot2/selectors/tournament_selection_dominated.py +++ b/tpot2/selectors/tournament_selection_dominated.py @@ -1,10 +1,9 @@ import numpy as np -import random from.nsgaii import nondominated_sorting, crowding_distance, dominates #based on deap -def tournament_selection_dominated(scores, k, n_parents=2): +def tournament_selection_dominated(scores, k, rng_=None, n_parents=2): """Select the best individual among *tournsize* randomly chosen individuals, *k* times. The returned list contains the indices of the chosen *individuals*. :param scores: The score matrix, where rows the individulas and the columns are the corresponds to scores on different objectives. @@ -15,6 +14,8 @@ def tournament_selection_dominated(scores, k, n_parents=2): This function uses the :func:`~random.choice` function from the python base :mod:`random` module. """ + + rng = np.random.default_rng(rng_) pareto_fronts = nondominated_sorting(scores) # chosen = list(itertools.chain.from_iterable(fronts)) @@ -37,26 +38,20 @@ def tournament_selection_dominated(scores, k, n_parents=2): chosen = [] for i in range(k*n_parents): - asp1 = random.randrange(len(scores)) - asp2 = random.randrange(len(scores)) + asp1 = rng.choice(len(scores)) + asp2 = rng.choice(len(scores)) if dominates(scores[asp1], scores[asp2]): chosen.append(asp1) elif dominates(scores[asp2], scores[asp1]): chosen.append(asp2) - + elif crowding_dict[asp1] > crowding_dict[asp2]: chosen.append(asp1) elif crowding_dict[asp1] < crowding_dict[asp2]: chosen.append(asp2) else: - chosen.append(random.choice([asp1,asp2])) - - return np.reshape(chosen, (k, n_parents)) - - - - - + chosen.append(rng.choice([asp1,asp2])) + return np.reshape(chosen, (k, n_parents)) diff --git a/tpot2/tpot_estimator/estimator.py b/tpot2/tpot_estimator/estimator.py index 8b9af74d..c534a7c1 100644 --- a/tpot2/tpot_estimator/estimator.py +++ b/tpot2/tpot_estimator/estimator.py @@ -5,7 +5,7 @@ import tpot2.config from sklearn.utils.validation import check_is_fitted from tpot2.selectors import survival_select_NSGA2, tournament_selection_dominated -from sklearn.preprocessing import LabelEncoder +from sklearn.preprocessing import LabelEncoder import pandas as pd from sklearn.model_selection import train_test_split @@ -29,7 +29,7 @@ def set_dask_settings(): #TODO inherit from _BaseComposition? 
class TPOTEstimator(BaseEstimator): - def __init__(self, scorers, + def __init__(self, scorers, scorers_weights, classification, cv = 5, @@ -41,11 +41,11 @@ def __init__(self, scorers, hyperparameter_probability = 1, hyper_node_probability = 0, hyperparameter_alpha = 1, - max_size = np.inf, + max_size = np.inf, linear_pipeline = False, root_config_dict= 'Auto', inner_config_dict=["selectors", "transformers"], - leaf_config_dict= None, + leaf_config_dict= None, cross_val_predict_cv = 0, categorical_features = None, subsets = None, @@ -53,25 +53,25 @@ def __init__(self, scorers, preprocessing = False, population_size = 50, initial_population_size = None, - population_scaling = .5, - generations_until_end_population = 1, + population_scaling = .5, + generations_until_end_population = 1, generations = None, - max_time_seconds=3600, - max_eval_time_seconds=60*10, + max_time_seconds=3600, + max_eval_time_seconds=60*10, validation_strategy = "none", validation_fraction = .2, disable_label_encoder = False, - - #early stopping parameters + + #early stopping parameters early_stop = None, scorers_early_stop_tol = 0.001, other_objectives_early_stop_tol =None, - threshold_evaluation_early_stop = None, + threshold_evaluation_early_stop = None, threshold_evaluation_scaling = .5, - selection_evaluation_early_stop = None, - selection_evaluation_scaling = .5, + selection_evaluation_early_stop = None, + selection_evaluation_scaling = .5, min_history_threshold = 20, - + #evolver parameters survival_percentage = 1, crossover_probability=.2, @@ -80,77 +80,80 @@ def __init__(self, scorers, crossover_then_mutate_probability=.05, survival_selector = survival_select_NSGA2, parent_selector = tournament_selection_dominated, - + #budget parameters budget_range = None, budget_scaling = .5, - generations_until_end_budget = 1, + generations_until_end_budget = 1, stepwise_steps = 5, - + optuna_optimize_pareto_front = False, optuna_optimize_pareto_front_trials = 100, optuna_optimize_pareto_front_timeout = 60*10, optuna_storage = "sqlite:///optuna.db", - + #dask parameters n_jobs=1, memory_limit = "4GB", client = None, processes = True, - + #debugging and logging parameters warm_start = False, subset_column = None, - periodic_checkpoint_folder = None, + periodic_checkpoint_folder = None, callback = None, - + verbose = 0, scatter = True, + # random seed for random number generator (rng) + random_state = None, + ): - + ''' An sklearn baseestimator that uses genetic programming to optimize a pipeline. - + Parameters ---------- - + scorers : (list, scorer) - A scorer or list of scorers to be used in the cross-validation process. + A scorer or list of scorers to be used in the cross-validation process. see https://scikit-learn.org/stable/modules/model_evaluation.html - + scorers_weights : list A list of weights to be applied to the scorers during the optimization process. - + classification : bool If True, the problem is treated as a classification problem. If False, the problem is treated as a regression problem. Used to determine the CV strategy. - + cv : int, cross-validator - (int): Number of folds to use in the cross-validation process. By uses the sklearn.model_selection.KFold cross-validator for regression and StratifiedKFold for classification. In both cases, shuffled is set to True. - (sklearn.model_selection.BaseCrossValidator): A cross-validator to use in the cross-validation process. - max_depth (int): The maximum depth from any node to the root of the pipelines to be generated. 
- + other_objective_functions : list, default=[] A list of other objective functions to apply to the pipeline. The function takes a single parameter for the graphpipeline estimator and returns either a single score or a list of scores. - + other_objective_functions_weights : list, default=[] A list of weights to be applied to the other objective functions. - + objective_function_names : list, default=None A list of names to be applied to the objective functions. If None, will use the names of the objective functions. - + bigger_is_better : bool, default=True If True, the objective function is maximized. If False, the objective function is minimized. Use negative weights to reverse the direction. - + max_size : int, default=np.inf The maximum number of nodes of the pipelines to be generated. - + linear_pipeline : bool, default=False If True, the pipelines generated will be linear. If False, the pipelines generated will be directed acyclic graphs. - + root_config_dict : dict, default='auto' The configuration dictionary to use for the root node of the model. If 'auto', will use "classifiers" if classification=True, else "regressors". @@ -168,7 +171,7 @@ def __init__(self, scorers, - 'genetic encoders' : Includes Genetic Encoder methods as used in AutoQTL. - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - + inner_config_dict : dict, default=["selectors", "transformers"] The configuration dictionary to use for the inner nodes of the model generation. Default ["selectors", "transformers"] @@ -187,10 +190,10 @@ def __init__(self, scorers, - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - None : If None and max_depth>1, the root_config_dict will be used for the inner nodes as well. - - leaf_config_dict : dict, default=None + + leaf_config_dict : dict, default=None The configuration dictionary to use for the leaf node of the model. If set, leaf nodes must be from this dictionary. - Otherwise leaf nodes will be generated from the root_config_dict. + Otherwise leaf nodes will be generated from the root_config_dict. Default None - 'selectors' : A selection of sklearn Selector methods. - 'classifiers' : A selection of sklearn Classifier methods. @@ -207,14 +210,14 @@ def __init__(self, scorers, - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - None : If None, a leaf will not be required (i.e. the pipeline can be a single root node). Leaf nodes will be generated from the inner_config_dict. - + cross_val_predict_cv : int, default=0 Number of folds to use for the cross_val_predict function for inner classifiers and regressors. Estimators will still be fit on the full dataset, but the following node will get the outputs from cross_val_predict. - + - 0-1 : When set to 0 or 1, the cross_val_predict function will not be used. The next layer will get the outputs from fitting and transforming the full dataset. - - >=2 : When fitting pipelines with inner classifiers or regressors, they will still be fit on the full dataset. 
+ - >=2 : When fitting pipelines with inner classifiers or regressors, they will still be fit on the full dataset. However, the output to the next node will come from cross_val_predict with the specified number of folds. - + categorical_features: list or None Categorical columns to inpute and/or one hot encode during the preprocessing step. Used only if preprocessing is not False. - None : If None, TPOT2 will automatically use object columns in pandas dataframes as objects for one hot encoding in preprocessing. @@ -222,7 +225,7 @@ def __init__(self, scorers, subsets : str or list, default=None Sets the subsets that the FeatureSetSeletor will select from if set as an option in one of the configuration dictionaries. - - str : If a string, it is assumed to be a path to a csv file with the subsets. + - str : If a string, it is assumed to be a path to a csv file with the subsets. The first column is assumed to be the name of the subset and the remaining columns are the features in the subset. - list or np.ndarray : If a list or np.ndarray, it is assumed to be a list of subsets. - None : If None, each column will be treated as a subset. One column will be selected per subset. @@ -245,178 +248,185 @@ def __init__(self, scorers, - None: TPOT does not use memory caching. - preprocessing : bool or BaseEstimator/Pipeline, + preprocessing : bool or BaseEstimator/Pipeline, EXPERIMENTAL A pipeline that will be used to preprocess the data before CV. - bool : If True, will use a default preprocessing pipeline. - Pipeline : If an instance of a pipeline is given, will use that pipeline as the preprocessing pipeline. - + population_size : int, default=50 Size of the population - + initial_population_size : int, default=None Size of the initial population. If None, population_size will be used. - + population_scaling : int, default=0.5 Scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. - - generations_until_end_population : int, default=1 - Number of generations until the population size reaches population_size - + + generations_until_end_population : int, default=1 + Number of generations until the population size reaches population_size + generations : int, default=50 Number of generations to run - + max_time_seconds : float, default=float("inf") Maximum time to run the optimization. If none or inf, will run until the end of the generations. - + max_eval_time_seconds : float, default=60*5 Maximum time to evaluate a single individual. If none or inf, there will be no time limit per evaluation. - + validation_strategy : str, default='none' EXPERIMENTAL The validation strategy to use for selecting the final pipeline from the population. TPOT2 may overfit the cross validation score. A second validation set can be used to select the final pipeline. - 'auto' : Automatically determine the validation strategy based on the dataset shape. - - 'reshuffled' : Use the same data for cross validation and final validation, but with different splits for the folds. This is the default for small datasets. - - 'split' : Use a separate validation set for final validation. Data will be split according to validation_fraction. This is the default for medium datasets. + - 'reshuffled' : Use the same data for cross validation and final validation, but with different splits for the folds. This is the default for small datasets. + - 'split' : Use a separate validation set for final validation. Data will be split according to validation_fraction. 
This is the default for medium datasets. - 'none' : Do not use a separate validation set for final validation. Select based on the original cross-validation score. This is the default for large datasets. validation_fraction : float, default=0.2 EXPERIMENTAL The fraction of the dataset to use for the validation set when validation_strategy is 'split'. Must be between 0 and 1. - + disable_label_encoder : bool, default=False If True, TPOT will check if the target needs to be relabeled to be sequential ints from 0 to N. This is necessary for XGBoost compatibility. If the labels need to be encoded, TPOT2 will use sklearn.preprocessing.LabelEncoder to encode the labels. The encoder can be accessed via the self.label_encoder_ attribute. If False, no additional label encoders will be used. early_stop : int, default=None Number of generations without improvement before early stopping. All objectives must have converged within the tolerance for this to be triggered. - - scorers_early_stop_tol : + + scorers_early_stop_tol : -list of floats list of tolerances for each scorer. If the difference between the best score and the current score is less than the tolerance, the individual is considered to have converged If an index of the list is None, that item will not be used for early stopping - -int + -int If an int is given, it will be used as the tolerance for all objectives - - other_objectives_early_stop_tol : + + other_objectives_early_stop_tol : -list of floats list of tolerances for each of the other objective function. If the difference between the best score and the current score is less than the tolerance, the individual is considered to have converged If an index of the list is None, that item will not be used for early stopping - -int + -int If an int is given, it will be used as the tolerance for all objectives - + threshold_evaluation_early_stop : list [start, end], default=None starting and ending percentile to use as a threshold for the evaluation early stopping. Values between 0 and 100. - + threshold_evaluation_scaling : float [0,inf), default=0.5 A scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. Must be greater than zero. Higher numbers will move the threshold to the end faster. - + selection_evaluation_early_stop : list, default=None A lower and upper percent of the population size to select each round of CV. Values between 0 and 1. - - selection_evaluation_scaling : float, default=0.5 + + selection_evaluation_scaling : float, default=0.5 A scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. - Must be greater than zero. Higher numbers will move the threshold to the end faster. - + Must be greater than zero. Higher numbers will move the threshold to the end faster. + min_history_threshold : int, default=0 The minimum number of previous scores needed before using threshold early stopping. - + survival_percentage : float, default=1 - Percentage of the population size to utilize for mutation and crossover at the beginning of the generation. The rest are discarded. Individuals are selected with the selector passed into survival_selector. The value of this parameter must be between 0 and 1, inclusive. + Percentage of the population size to utilize for mutation and crossover at the beginning of the generation. The rest are discarded. Individuals are selected with the selector passed into survival_selector. The value of this parameter must be between 0 and 1, inclusive. 
For example, if the population size is 100 and the survival percentage is .5, 50 individuals will be selected with NSGA2 from the existing population. These will be used for mutation and crossover to generate the next 100 individuals for the next generation. The remainder are discarded from the live population. In the next generation, there will now be the 50 parents + the 100 individuals for a total of 150. Surivival percentage is based of the population size parameter and not the existing population size (current population size when using successive halving). Therefore, in the next generation we will still select 50 individuals from the currently existing 150. - + crossover_probability : float, default=.2 Probability of generating a new individual by crossover between two individuals. - + mutate_probability : float, default=.7 Probability of generating a new individual by crossover between one individuals. - + mutate_then_crossover_probability : float, default=.05 Probability of generating a new individual by mutating two individuals followed by crossover. - + crossover_then_mutate_probability : float, default=.05 Probability of generating a new individual by crossover between two individuals followed by a mutation of the resulting individual. - + survival_selector : function, default=survival_select_NSGA2 Function to use to select individuals for survival. Must take a matrix of scores and return selected indexes. Used to selected population_size * survival_percentage individuals at the start of each generation to use for mutation and crossover. - + parent_selector : function, default=parent_select_NSGA2 Function to use to select pairs parents for crossover and individuals for mutation. Must take a matrix of scores and return selected indexes. - + budget_range : list [start, end], default=None A starting and ending budget to use for the budget scaling. - + budget_scaling float : [0,1], default=0.5 A scaling factor to use when determining how fast we move the budget from the start to end budget. - + generations_until_end_budget : int, default=1 The number of generations to run before reaching the max budget. - + stepwise_steps : int, default=1 The number of staircase steps to take when scaling the budget and population size. - - + + n_jobs : int, default=1 Number of processes to run in parallel. - + memory_limit : str, default="4GB" Memory limit for each job. See Dask [LocalCluster documentation](https://distributed.dask.org/en/stable/api.html#distributed.Client) for more information. - + client : dask.distributed.Client, default=None - A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. - + A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. + processes : bool, default=True If True, will use multiprocessing to parallelize the optimization process. If False, will use threading. True seems to perform better. However, False is required for interactive debugging. - - + + warm_start : bool, default=False If True, will use the continue the evolutionary algorithm from the last generation of the previous run. - + subset_column : str or int, default=None EXPERIMENTAL The column to use for the subset selection. Must also pass in unique_subset_values to GraphIndividual to function. 
- + periodic_checkpoint_folder : str, default=None Folder to save the population to periodically. If None, no periodic saving will be done. If provided, training will resume from this checkpoint. - + callback : tpot2.CallBackInterface, default=None Callback object. Not implemented - - verbose : int, default=1 + + verbose : int, default=1 How much information to print during the optimization process. Higher values include the information from lower values. 0. nothing 1. progress bar - + 3. best individual 4. warnings >=5. full warnings trace 6. evaluations progress bar. (Temporary: This used to be 2. Currently, using evaluation progress bar may prevent some instances where we terminate a generation early due to it reaching max_time_seconds in the middle of a generation OR a pipeline failed to be terminated normally and we need to manually terminate it.) - - + + random_state : int, None, default=None + A seed for reproducibility of experiments. This value will be passed to numpy.random.default_rng() to create an instance of the generator to pass to other classes. + + - int + Will be used to create and lock in a Generator instance with 'numpy.random.default_rng()' + - None + Will be used to create a Generator with 'numpy.random.default_rng()' where fresh, unpredictable entropy will be pulled from the OS + Attributes ---------- fitted_pipeline_ : GraphPipeline A fitted instance of the GraphPipeline that inherits from sklearn BaseEstimator. This is fitted on the full X, y passed to fit. - evaluated_individuals : A pandas data frame containing data for all evaluated individuals in the run. - Columns: + evaluated_individuals : A pandas data frame containing data for all evaluated individuals in the run. + Columns: - *objective functions : The first few columns correspond to the passed in scorers and objective functions - Parents : A tuple containing the indexes of the pipelines used to generate the pipeline of that row. If NaN, this pipeline was generated randomly in the initial population. - Variation_Function : Which variation function was used to mutate or crossover the parents. If NaN, this pipeline was generated randomly in the initial population. - Individual : The internal representation of the individual that is used during the evolutionary algorithm. This is not an sklearn BaseEstimator. - - Generation : The generation the pipeline first appeared. - - Pareto_Front : The nondominated front that this pipeline belongs to. 0 means that its scores is not strictly dominated by any other individual. - To save on computational time, the best frontier is updated iteratively each generation. + - Generation : The generation the pipeline first appeared. + - Pareto_Front : The nondominated front that this pipeline belongs to. 0 means that its scores is not strictly dominated by any other individual. + To save on computational time, the best frontier is updated iteratively each generation. The pipelines with the 0th pareto front do represent the exact best frontier. However, the pipelines with pareto front >= 1 are only in reference to the other pipelines in the final population. - All other pipelines are set to NaN. - - Instance : The unfitted GraphPipeline BaseEstimator. + All other pipelines are set to NaN. + - Instance : The unfitted GraphPipeline BaseEstimator. - *validation objective functions : Objective function scores evaluated on the validation set. - Validation_Pareto_Front : The full pareto front calculated on the validation set. This is calculated for all pipelines with Pareto_Front equal to 0.
Unlike the Pareto_Front which only calculates the frontier and the final population, the Validation Pareto Front is calculated for all pipelines tested on the validation set. - + pareto_front : The same pandas dataframe as evaluated individuals, but containing only the frontier pareto front pipelines. ''' @@ -455,7 +465,7 @@ def __init__(self, scorers, self.early_stop = early_stop self.scorers_early_stop_tol = scorers_early_stop_tol self.other_objectives_early_stop_tol = other_objectives_early_stop_tol - self.max_time_seconds = max_time_seconds + self.max_time_seconds = max_time_seconds self.max_eval_time_seconds = max_eval_time_seconds self.n_jobs= n_jobs self.memory_limit = memory_limit @@ -491,6 +501,13 @@ def __init__(self, scorers, self.optuna_optimize_pareto_front_timeout = optuna_optimize_pareto_front_timeout self.optuna_storage = optuna_storage + # create random number generator based on rng_seed + self.rng = np.random.default_rng(random_state) + # save random state passed to us for other functions that use random_state + self.random_state = random_state + # set the numpy seed so anything using it will be consistent as well + np.random.seed(random_state) + #Initialize other used params @@ -506,22 +523,22 @@ def __init__(self, scorers, self._scorers = [self.scorers] else: self._scorers = self.scorers - + self._scorers = [sklearn.metrics.get_scorer(scoring) for scoring in self._scorers] self._scorers_early_stop_tol = self.scorers_early_stop_tol - + self._evolver = tpot2.evolvers.BaseEvolver - + self.objective_function_weights = [*scorers_weights, *other_objective_functions_weights] - + if self.objective_function_names is None: obj_names = [f.__name__ for f in other_objective_functions] else: obj_names = self.objective_function_names self.objective_names = [f._score_func.__name__ if hasattr(f,"_score_func") else f.__name__ for f in self._scorers] + obj_names - - + + if not isinstance(self.other_objectives_early_stop_tol, list): self._other_objectives_early_stop_tol = [self.other_objectives_early_stop_tol for _ in range(len(self.other_objective_functions))] else: @@ -533,7 +550,7 @@ def __init__(self, scorers, self._scorers_early_stop_tol = self._scorers_early_stop_tol self.early_stop_tol = [*self._scorers_early_stop_tol, *self._other_objectives_early_stop_tol] - + self._evolver_instance = None self.evaluated_individuals = None @@ -564,8 +581,8 @@ def fit(self, X, y): if self.classification and not self.disable_label_encoder and not check_if_y_is_encoded(y): warnings.warn("Labels are not encoded as ints from 0 to N. For compatibility with some classifiers such as sklearn, TPOT has encoded y with the sklearn LabelEncoder. 
When using pipelines outside the main TPOT estimator class, you can encode the labels with est.label_encoder_") - self.label_encoder_ = LabelEncoder() - y = self.label_encoder_.fit_transform(y) + self.label_encoder_ = LabelEncoder() + y = self.label_encoder_.fit_transform(y) self.evaluated_individuals = None #determine validation strategy @@ -584,9 +601,9 @@ def fit(self, X, y): if validation_strategy == 'split': if self.classification: - X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, stratify=y, random_state=42) + X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, stratify=y, random_state=self.random_state) else: - X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, random_state=42) + X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, random_state=self.random_state) X_original = X @@ -598,7 +615,7 @@ def fit(self, X, y): if self.classification: X, y = remove_underrepresented_classes(X, y, n_folds) - + if self.preprocessing: #X = pd.DataFrame(X) @@ -616,7 +633,7 @@ def fit(self, X, y): tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'), #impute remaining numeric columns tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001)) #one hot encode categorical columns else: #numpy array and no categorical columns specified, just do imputation - self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')) + self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')) else: @@ -626,6 +643,16 @@ def fit(self, X, y): #Set up the configuation dictionaries and the search spaces + #check if self.cv is a number + if isinstance(self.cv, int) or isinstance(self.cv, float): + if self.classification: + self.cv_gen = sklearn.model_selection.StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=self.random_state) + else: + self.cv_gen = sklearn.model_selection.KFold(n_splits=self.cv, shuffle=True, random_state=self.random_state) + + else: + self.cv_gen = sklearn.model_selection.check_cv(self.cv, y, classifier=self.classification) + n_samples= int(math.floor(X.shape[0]/n_folds)) @@ -639,54 +666,42 @@ def fit(self, X, y): if self.root_config_dict == 'Auto': if self.classification: n_classes = len(np.unique(y)) - root_config_dict = get_configuration_dictionary("classifiers", n_samples, n_features, self.classification, subsets=self.subsets, feature_names=self.feature_names, n_classes=n_classes) + root_config_dict = get_configuration_dictionary("classifiers", n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names, n_classes=n_classes) else: - root_config_dict = get_configuration_dictionary("regressors", n_samples, n_features, self.classification,subsets=self.subsets, feature_names=self.feature_names) + root_config_dict = get_configuration_dictionary("regressors", n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) else: - root_config_dict = get_configuration_dictionary(self.root_config_dict, n_samples, n_features, self.classification, subsets=self.subsets,feature_names=self.feature_names) - - inner_config_dict = get_configuration_dictionary(self.inner_config_dict, n_samples, n_features, self.classification,subsets=self.subsets, 
feature_names=self.feature_names) - leaf_config_dict = get_configuration_dictionary(self.leaf_config_dict, n_samples, n_features, self.classification, subsets=self.subsets, feature_names=self.feature_names) - - + root_config_dict = get_configuration_dictionary(self.root_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets,feature_names=self.feature_names) + inner_config_dict = get_configuration_dictionary(self.inner_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) + leaf_config_dict = get_configuration_dictionary(self.leaf_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) - #check if self.cv is a number - if isinstance(self.cv, int) or isinstance(self.cv, float): - if self.classification: - self.cv_gen = sklearn.model_selection.StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=42) - else: - self.cv_gen = sklearn.model_selection.KFold(n_splits=self.cv, shuffle=True, random_state=42) - else: - self.cv_gen = sklearn.model_selection.check_cv(self.cv, y, classifier=self.classification) - - def objective_function(pipeline_individual, - X, + def objective_function(pipeline_individual, + X, y, is_classification=self.classification, - scorers= self._scorers, - cv=self.cv_gen, + scorers= self._scorers, + cv=self.cv_gen, other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, - **kwargs): + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, + **kwargs): return objective_function_generator( pipeline_individual, - X, - y, + X, + y, is_classification=is_classification, - scorers= scorers, - cv=cv, + scorers= scorers, + cv=cv, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, ) - self.individual_generator_instance = tpot2.individual_representations.graph_pipeline_individual.estimator_graph_individual_generator( + self.individual_generator_instance = tpot2.individual_representations.graph_pipeline_individual.estimator_graph_individual_generator( inner_config_dict=inner_config_dict, root_config_dict=root_config_dict, leaf_config_dict=leaf_config_dict, @@ -695,6 +710,7 @@ def objective_function(pipeline_individual, hyperparameter_probability=self.hyperparameter_probability, hyper_node_probability=self.hyper_node_probability, hyperparameter_alpha=self.hyperparameter_alpha, + rng_=self.rng, ) if self.threshold_evaluation_early_stop is not None or self.selection_evaluation_early_stop is not None: @@ -711,7 +727,7 @@ def objective_function(pipeline_individual, #If warm start and we have an evolver instance, use the existing one if not(self.warm_start and self._evolver_instance is not None): - self._evolver_instance = self._evolver( individual_generator=self.individual_generator_instance, + self._evolver_instance = self._evolver( individual_generator=self.individual_generator_instance, objective_functions= [objective_function], objective_function_weights = self.objective_function_weights, objective_names=self.objective_names, @@ -735,7 +751,7 @@ def objective_function(pipeline_individual, early_stop_tol = self.early_stop_tol, 
early_stop= self.early_stop, - + budget_range = self.budget_range, budget_scaling = self.budget_scaling, generations_until_end_budget = self.generations_until_end_budget, @@ -752,10 +768,11 @@ def objective_function(pipeline_individual, mutate_probability = self.mutate_probability, mutate_then_crossover_probability= self.mutate_then_crossover_probability, crossover_then_mutate_probability= self.crossover_then_mutate_probability, - + + rng_=self.rng, ) - + self._evolver_instance.optimize() #self._evolver_instance.population.update_pareto_fronts(self.objective_names, self.objective_function_weights) self.make_evaluated_individuals() @@ -765,24 +782,24 @@ def objective_function(pipeline_individual, pareto_front_inds = self.pareto_front['Individual'].values all_graphs, all_scores = tpot2.individual_representations.graph_pipeline_individual.simple_parallel_optuna(pareto_front_inds, objective_function, self.objective_function_weights, _client, storage=self.optuna_storage, steps=self.optuna_optimize_pareto_front_trials, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, max_time_seconds=self.optuna_optimize_pareto_front_timeout, **{"X": X, "y": y}) all_scores = tpot2.utils.eval_utils.process_scores(all_scores, len(self.objective_function_weights)) - + if len(all_graphs) > 0: df = pd.DataFrame(np.column_stack((all_graphs, all_scores,np.repeat("Optuna",len(all_graphs)))), columns=["Individual"] + self.objective_names +["Parents"]) for obj in self.objective_names: df[obj] = df[obj].apply(convert_to_float) - + self.evaluated_individuals = pd.concat([self.evaluated_individuals, df], ignore_index=True) else: print("WARNING NO OPTUNA TRIALS COMPLETED") - + tpot2.utils.get_pareto_frontier(self.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"]) if validation_strategy == 'reshuffled': best_pareto_front_idx = list(self.pareto_front.index) best_pareto_front = list(self.pareto_front.loc[best_pareto_front_idx]['Individual']) - + #reshuffle rows - X, y = sklearn.utils.shuffle(X, y, random_state=1) + X, y = sklearn.utils.shuffle(X, y, random_state=self.random_state) if self.scatter: X_future = _client.scatter(X) @@ -791,30 +808,30 @@ def objective_function(pipeline_individual, X_future = X y_future = y - val_objective_function_list = [lambda ind, - X, - y, + val_objective_function_list = [lambda ind, + X, + y, is_classification=self.classification, - scorers= self._scorers, - cv=self.cv_gen, - other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, + scorers= self._scorers, + cv=self.cv_gen, + other_objective_functions=self.other_objective_functions, + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, **kwargs: objective_function_generator( ind, X, - y, + y, is_classification=is_classification, - scorers= scorers, - cv=cv, + scorers= scorers, + cv=cv, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, )] - + objective_kwargs = {"X": X_future, "y": y_future} val_scores = tpot2.utils.eval_utils.parallel_eval_objective_list( best_pareto_front, @@ -829,7 +846,7 @@ def objective_function(pipeline_individual, elif validation_strategy == 'split': - if self.scatter: + if self.scatter: 
X_future = _client.scatter(X) y_future = _client.scatter(y) X_val_future = _client.scatter(X_val) @@ -841,33 +858,33 @@ def objective_function(pipeline_individual, y_val_future = y_val objective_kwargs = {"X": X_future, "y": y_future, "X_val" : X_val_future, "y_val":y_val_future } - + best_pareto_front_idx = list(self.pareto_front.index) best_pareto_front = list(self.pareto_front.loc[best_pareto_front_idx]['Individual']) - val_objective_function_list = [lambda ind, - X, - y, - X_val, - y_val, - scorers= self._scorers, - other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, + val_objective_function_list = [lambda ind, + X, + y, + X_val, + y_val, + scorers= self._scorers, + other_objective_functions=self.other_objective_functions, + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, **kwargs: val_objective_function_generator( ind, X, y, - X_val, - y_val, - scorers= scorers, + X_val, + y_val, + scorers= scorers, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, )] - + val_scores = tpot2.utils.eval_utils.parallel_eval_objective_list( best_pareto_front, val_objective_function_list, n_jobs=self.n_jobs, verbose=self.verbose, timeout=self.max_eval_time_seconds,n_expected_columns=len(self.objective_names),client=_client, **objective_kwargs) @@ -879,25 +896,25 @@ def objective_function(pipeline_individual, else: self.objective_names_for_selection = self.objective_names - val_scores = self.evaluated_individuals[~self.evaluated_individuals[self.objective_names_for_selection].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names_for_selection].astype(float) + val_scores = self.evaluated_individuals[~self.evaluated_individuals[self.objective_names_for_selection].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names_for_selection].astype(float) weighted_scores = val_scores*self.objective_function_weights - + if self.bigger_is_better: best_idx = weighted_scores[self.objective_names_for_selection[0]].idxmax() else: best_idx = weighted_scores[self.objective_names_for_selection[0]].idxmin() - + best_individual = self.evaluated_individuals.loc[best_idx]['Individual'] self.selected_best_score = self.evaluated_individuals.loc[best_idx] - + best_individual_pipeline = best_individual.export_pipeline(memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, subset_column=self.subset_column) if self.preprocessing: self.fitted_pipeline_ = sklearn.pipeline.make_pipeline(sklearn.base.clone(self._preprocessing_pipeline), best_individual_pipeline ) else: - self.fitted_pipeline_ = best_individual_pipeline - + self.fitted_pipeline_ = best_individual_pipeline + self.fitted_pipeline_.fit(X_original,y_original) #TODO use y_original as well? @@ -907,7 +924,7 @@ def objective_function(pipeline_individual, cluster.close() return self - + def _estimator_has(attr): '''Check if we can delegate a method to the underlying estimator. 
First, we check the first fitted final estimator if available, otherwise we @@ -919,7 +936,7 @@ def _estimator_has(attr): - + @available_if(_estimator_has('predict')) @@ -932,19 +949,19 @@ def predict(self, X, **predict_params): preds = self.label_encoder_.inverse_transform(preds) return preds - + @available_if(_estimator_has('predict_proba')) def predict_proba(self, X, **predict_params): check_is_fitted(self) #X = check_array(X) return self.fitted_pipeline_.predict_proba(X,**predict_params) - + @available_if(_estimator_has('decision_function')) def decision_function(self, X, **predict_params): check_is_fitted(self) #X = check_array(X) return self.fitted_pipeline_.decision_function(X,**predict_params) - + @available_if(_estimator_has('transform')) def transform(self, X, **predict_params): check_is_fitted(self) @@ -958,7 +975,7 @@ def classes_(self): return self.label_encoder_.classes_ else: return self.fitted_pipeline_.classes_ - + @property def _estimator_type(self): @@ -977,7 +994,7 @@ def make_evaluated_individuals(self): self.evaluated_individuals["Instance"] = self.evaluated_individuals["Individual"].apply(lambda ind: apply_make_pipeline(ind, preprocessing_pipeline=self._preprocessing_pipeline)) return self.evaluated_individuals - + @property def pareto_front(self): #check if _evolver_instance exists @@ -988,5 +1005,3 @@ def pareto_front(self): return self.evaluated_individuals else: return self.evaluated_individuals[self.evaluated_individuals["Pareto_Front"]==1] - - diff --git a/tpot2/tpot_estimator/estimator_utils.py b/tpot2/tpot_estimator/estimator_utils.py index 08d25f1b..07fe65ac 100644 --- a/tpot2/tpot_estimator/estimator_utils.py +++ b/tpot2/tpot_estimator/estimator_utils.py @@ -13,7 +13,7 @@ def convert_parents_tuples_to_integers(row, object_to_int): return np.nan def apply_make_pipeline(graphindividual, preprocessing_pipeline=None): - try: + try: if preprocessing_pipeline is None: return graphindividual.export_pipeline() else: @@ -21,13 +21,13 @@ def apply_make_pipeline(graphindividual, preprocessing_pipeline=None): except: return None -def get_configuration_dictionary(options, n_samples, n_features, classification, subsets=None, feature_names=None, n_classes=None): +def get_configuration_dictionary(options, n_samples, n_features, classification, random_state=None, cv=None, subsets=None, feature_names=None, n_classes=None): if options is None: return options if isinstance(options, dict): - return recursive_with_defaults(options, n_samples, n_features, classification, subsets=subsets, feature_names=feature_names) - + return recursive_with_defaults(options, n_samples, n_features, classification, random_state=random_state, cv=cv, subsets=subsets, feature_names=feature_names, n_classes=n_classes) + if not isinstance(options, list): options = [options] @@ -36,23 +36,23 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, for option in options: if option == "selectors": - config_dict.update(tpot2.config.make_selector_config_dictionary(classification)) + config_dict.update(tpot2.config.make_selector_config_dictionary(random_state=random_state, classifier=classification)) elif option == "classifiers": - config_dict.update(tpot2.config.make_classifier_config_dictionary(n_samples=n_samples, n_classes=n_classes)) + config_dict.update(tpot2.config.make_classifier_config_dictionary(random_state=random_state, n_samples=n_samples, n_classes=n_classes)) elif option == "classifiers_sklearnex": -
config_dict.update(tpot2.config.make_sklearnex_classifier_config_dictionary(n_samples=n_samples, n_classes=n_classes)) + config_dict.update(tpot2.config.make_sklearnex_classifier_config_dictionary(random_state=random_state, n_samples=n_samples, n_classes=n_classes)) elif option == "regressors": - config_dict.update(tpot2.config.make_regressor_config_dictionary(n_samples=n_samples)) + config_dict.update(tpot2.config.make_regressor_config_dictionary(random_state=random_state, cv=cv, n_samples=n_samples)) elif option == "regressors_sklearnex": - config_dict.update(tpot2.config.make_sklearnex_regressor_config_dictionary(n_samples=n_samples)) + config_dict.update(tpot2.config.make_sklearnex_regressor_config_dictionary(random_state=random_state, n_samples=n_samples)) elif option == "transformers": - config_dict.update(tpot2.config.make_transformer_config_dictionary(n_features=n_features)) - + config_dict.update(tpot2.config.make_transformer_config_dictionary(random_state=random_state, n_features=n_features)) + elif option == "arithmetic_transformer": config_dict.update(tpot2.config.make_arithmetic_transformer_config_dictionary()) @@ -61,10 +61,10 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, elif option == "skrebate": config_dict.update(tpot2.config.make_skrebate_config_dictionary(n_features=n_features)) - + elif option == "MDR": config_dict.update(tpot2.config.make_MDR_config_dictionary()) - + elif option == "continuousMDR": config_dict.update(tpot2.config.make_ContinuousMDR_config_dictionary()) @@ -76,26 +76,26 @@ def get_configuration_dictionary(options, n_samples, n_features, classification, elif option == "passthrough": config_dict.update(tpot2.config.make_passthrough_config_dictionary()) - + else: - config_dict.update(recursive_with_defaults(option, n_samples, n_features, classification, subsets=subsets, feature_names=feature_names)) + config_dict.update(recursive_with_defaults(option, n_samples, n_features, classification, random_state, cv, subsets=subsets, feature_names=feature_names, n_classes=n_classes)) if len(config_dict) == 0: raise ValueError("No valid configuration options were provided.
Please check the options you provided and try again.") return config_dict -def recursive_with_defaults(config_dict, n_samples, n_features, classification, subsets=None, feature_names=None): - +def recursive_with_defaults(config_dict, n_samples, n_features, classification, random_state=None, cv=None, subsets=None, feature_names=None, n_classes=None): + for key in 'leaf_config_dict', 'root_config_dict', 'inner_config_dict', 'Recursive': if key in config_dict: value = config_dict[key] if key=="Resursive": - config_dict[key] = recursive_with_defaults(value,n_samples, n_features, classification, subsets=None, feature_names=None) + config_dict[key] = recursive_with_defaults(value, n_samples, n_features, classification, random_state, cv, subsets=None, feature_names=None, n_classes=None) else: - config_dict[key] = get_configuration_dictionary(value, n_samples, n_features, classification, subsets, feature_names) - + config_dict[key] = get_configuration_dictionary(value, n_samples, n_features, classification, random_state, cv, subsets, feature_names, n_classes) + return config_dict @@ -117,14 +117,14 @@ def objective_function_generator(pipeline, x,y, scorers, cv, other_objective_fun cv_obj_scores = cross_val_score_objective(sklearn.base.clone(pipeline),x,y,scorers=scorers, cv=cv , fold=step) else: cv_obj_scores = [] - + if other_objective_functions is not None and len(other_objective_functions) >0: other_scores = [obj(sklearn.base.clone(pipeline)) for obj in other_objective_functions] #flatten other_scores = np.array(other_scores).flatten().tolist() else: other_scores = [] - + return np.concatenate([cv_obj_scores,other_scores]) def val_objective_function_generator(pipeline, X_train, y_train, X_test, y_test, scorers, other_objective_functions, memory, cross_val_predict_cv, subset_column): @@ -134,12 +134,12 @@ def val_objective_function_generator(pipeline, X_train, y_train, X_test, y_test, fitted_pipeline.fit(X_train, y_train) if len(scorers) > 0: - scores =[sklearn.metrics.get_scorer(scorer)(fitted_pipeline, X_test, y_test) for scorer in scorers] + scores =[sklearn.metrics.get_scorer(scorer)(fitted_pipeline, X_test, y_test) for scorer in scorers] other_scores = [] if other_objective_functions is not None and len(other_objective_functions) >0: other_scores = [obj(sklearn.base.clone(pipeline)) for obj in other_objective_functions] - + return np.concatenate([scores,other_scores]) @@ -170,7 +170,7 @@ def convert_to_float(x): return float(x) except ValueError: return x - + @@ -180,9 +180,3 @@ def check_if_y_is_encoded(y): ''' y = sorted(set(y)) return all(i == j for i, j in enumerate(y)) - - - - - - diff --git a/tpot2/tpot_estimator/steady_state_estimator.py b/tpot2/tpot_estimator/steady_state_estimator.py index 72f7f595..0f48c827 100644 --- a/tpot2/tpot_estimator/steady_state_estimator.py +++ b/tpot2/tpot_estimator/steady_state_estimator.py @@ -5,8 +5,8 @@ import tpot2.config from sklearn.utils.validation import check_is_fitted from tpot2.selectors import survival_select_NSGA2, tournament_selection_dominated -from sklearn.preprocessing import LabelEncoder -from sklearn.utils.multiclass import unique_labels +from sklearn.preprocessing import LabelEncoder +from sklearn.utils.multiclass import unique_labels import pandas as pd from sklearn.model_selection import train_test_split import tpot2 @@ -27,7 +27,7 @@ def set_dask_settings(): #TODO inherit from _BaseComposition? 
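For readers tracing the get_configuration_dictionary changes above: every option branch now forwards the same random_state (and, for regressors, the same cv splitter) into the corresponding tpot2.config.make_*_config_dictionary factory, and the fallback branch recurses on the single current option. A minimal sketch of that pass-through pattern, using hypothetical stand-in factories rather than the real ones:

    from sklearn.model_selection import StratifiedKFold

    def make_classifier_config(random_state=None, n_samples=10, n_classes=None):
        # stand-in for tpot2.config.make_classifier_config_dictionary
        return {"classifiers": {"random_state": random_state, "n_samples": n_samples, "n_classes": n_classes}}

    def make_regressor_config(random_state=None, cv=None, n_samples=10):
        # stand-in for tpot2.config.make_regressor_config_dictionary
        return {"regressors": {"random_state": random_state, "cv": cv, "n_samples": n_samples}}

    def get_config(options, n_samples, random_state=None, cv=None):
        # every branch receives the same seed, so a single random_state controls
        # every hyperparameter space the search can draw from
        config = {}
        for option in options:
            if option == "classifiers":
                config.update(make_classifier_config(random_state=random_state, n_samples=n_samples))
            elif option == "regressors":
                config.update(make_regressor_config(random_state=random_state, cv=cv, n_samples=n_samples))
        return config

    cv_gen = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    print(get_config(["classifiers", "regressors"], n_samples=100, random_state=42, cv=cv_gen))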
class TPOTEstimatorSteadyState(BaseEstimator): - def __init__(self, scorers= [], + def __init__(self, scorers= [], scorers_weights = [], classification = False, cv = 5, @@ -35,16 +35,16 @@ def __init__(self, scorers= [], other_objective_functions_weights = [], objective_function_names = None, bigger_is_better = True, - max_size = np.inf, + max_size = np.inf, linear_pipeline = False, root_config_dict= 'Auto', inner_config_dict=["selectors", "transformers"], - leaf_config_dict= None, + leaf_config_dict= None, cross_val_predict_cv = 0, categorical_features = None, subsets = None, memory = None, - preprocessing = False, + preprocessing = False, validation_strategy = "none", validation_fraction = .2, disable_label_encoder = False, @@ -53,14 +53,14 @@ def __init__(self, scorers= [], population_size = 50, max_evaluated_individuals = None, - + early_stop = None, early_stop_seconds = None, scorers_early_stop_tol = 0.001, other_objectives_early_stop_tol = None, - max_time_seconds=None, - max_eval_time_seconds=60*10, + max_time_seconds=None, + max_eval_time_seconds=60*10, n_jobs=1, memory_limit = "4GB", client = None, @@ -73,65 +73,68 @@ def __init__(self, scorers= [], parent_selector = tournament_selection_dominated, budget_range = None, budget_scaling = .5, - individuals_until_end_budget = 1, + individuals_until_end_budget = 1, stepwise_steps = 5, warm_start = False, subset_column = None, verbose = 0, - periodic_checkpoint_folder = None, + periodic_checkpoint_folder = None, callback = None, processes = True, scatter = True, + # random seed for random number generator (rng) + random_state = None, + optuna_optimize_pareto_front = False, optuna_optimize_pareto_front_trials = 100, optuna_optimize_pareto_front_timeout = 60*10, optuna_storage = "sqlite:///optuna.db", ): - + ''' An sklearn baseestimator that uses genetic programming to optimize a pipeline. - + Parameters ---------- - + scorers : (list, scorer) - A scorer or list of scorers to be used in the cross-validation process. + A scorer or list of scorers to be used in the cross-validation process. see https://scikit-learn.org/stable/modules/model_evaluation.html - + scorers_weights : list A list of weights to be applied to the scorers during the optimization process. - + classification : bool If True, the problem is treated as a classification problem. If False, the problem is treated as a regression problem. Used to determine the CV strategy. - + cv : int, cross-validator - (int): Number of folds to use in the cross-validation process. By uses the sklearn.model_selection.KFold cross-validator for regression and StratifiedKFold for classification. In both cases, shuffled is set to True. - (sklearn.model_selection.BaseCrossValidator): A cross-validator to use in the cross-validation process. - + other_objective_functions : list, default=[] A list of other objective functions to apply to the pipeline. The function takes a single parameter for the graphpipeline estimator and returns either a single score or a list of scores. - + other_objective_functions_weights : list, default=[] A list of weights to be applied to the other objective functions. - + objective_function_names : list, default=None A list of names to be applied to the objective functions. If None, will use the names of the objective functions. - + bigger_is_better : bool, default=True If True, the objective function is maximized. If False, the objective function is minimized. Use negative weights to reverse the direction. 
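The cv parameter above accepts either an integer or a ready-made cross-validator; with this change set, an integer is expanded inside fit() into a shuffled splitter seeded from random_state instead of a hard-coded seed. A standalone sketch mirroring that expansion, with illustrative values:

    import sklearn.model_selection

    classification = True
    cv = 5               # as passed to the estimator
    random_state = 42    # as passed to the estimator

    if isinstance(cv, (int, float)):
        if classification:
            cv_gen = sklearn.model_selection.StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
        else:
            cv_gen = sklearn.model_selection.KFold(n_splits=cv, shuffle=True, random_state=random_state)
    else:
        # an explicit cross-validator object is passed through unchanged
        cv_gen = sklearn.model_selection.check_cv(cv, classifier=classification)

    print(cv_gen)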
- + max_size : int, default=np.inf The maximum number of nodes of the pipelines to be generated. - + linear_pipeline : bool, default=False If True, the pipelines generated will be linear. If False, the pipelines generated will be directed acyclic graphs. - + root_config_dict : dict, default='auto' The configuration dictionary to use for the root node of the model. If 'auto', will use "classifiers" if classification=True, else "regressors". @@ -149,7 +152,7 @@ def __init__(self, scorers= [], - 'genetic encoders' : Includes Genetic Encoder methods as used in AutoQTL. - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - + inner_config_dict : dict, default=["selectors", "transformers"] The configuration dictionary to use for the inner nodes of the model generation. Default ["selectors", "transformers"] @@ -168,10 +171,10 @@ def __init__(self, scorers= [], - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - None : If None and max_depth>1, the root_config_dict will be used for the inner nodes as well. - - leaf_config_dict : dict, default=None + + leaf_config_dict : dict, default=None The configuration dictionary to use for the leaf node of the model. If set, leaf nodes must be from this dictionary. - Otherwise leaf nodes will be generated from the root_config_dict. + Otherwise leaf nodes will be generated from the root_config_dict. Default None - 'selectors' : A selection of sklearn Selector methods. - 'classifiers' : A selection of sklearn Classifier methods. @@ -188,14 +191,14 @@ def __init__(self, scorers= [], - 'FeatureEncodingFrequencySelector': Includes FeatureEncodingFrequencySelector method as used in AutoQTL. - list : a list of strings out of the above options to include the corresponding methods in the configuration dictionary. - None : If None, a leaf will not be required (i.e. the pipeline can be a single root node). Leaf nodes will be generated from the inner_config_dict. - + cross_val_predict_cv : int, default=0 Number of folds to use for the cross_val_predict function for inner classifiers and regressors. Estimators will still be fit on the full dataset, but the following node will get the outputs from cross_val_predict. - + - 0-1 : When set to 0 or 1, the cross_val_predict function will not be used. The next layer will get the outputs from fitting and transforming the full dataset. - - >=2 : When fitting pipelines with inner classifiers or regressors, they will still be fit on the full dataset. + - >=2 : When fitting pipelines with inner classifiers or regressors, they will still be fit on the full dataset. However, the output to the next node will come from cross_val_predict with the specified number of folds. - + categorical_features: list or None Categorical columns to inpute and/or one hot encode during the preprocessing step. Used only if preprocessing is not False. - None : If None, TPOT2 will automatically use object columns in pandas dataframes as objects for one hot encoding in preprocessing. @@ -203,7 +206,7 @@ def __init__(self, scorers= [], subsets : str or list, default=None Sets the subsets that the FeatureSetSeletor will select from if set as an option in one of the configuration dictionaries. 
- - str : If a string, it is assumed to be a path to a csv file with the subsets. + - str : If a string, it is assumed to be a path to a csv file with the subsets. The first column is assumed to be the name of the subset and the remaining columns are the features in the subset. - list or np.ndarray : If a list or np.ndarray, it is assumed to be a list of subsets. - None : If None, each column will be treated as a subset. One column will be selected per subset. @@ -226,186 +229,193 @@ def __init__(self, scorers= [], - None: TPOT does not use memory caching. - preprocessing : bool or BaseEstimator/Pipeline, + preprocessing : bool or BaseEstimator/Pipeline, EXPERIMENTAL A pipeline that will be used to preprocess the data before CV. - bool : If True, will use a default preprocessing pipeline. - Pipeline : If an instance of a pipeline is given, will use that pipeline as the preprocessing pipeline. - + validation_strategy : str, default='none' EXPERIMENTAL The validation strategy to use for selecting the final pipeline from the population. TPOT2 may overfit the cross validation score. A second validation set can be used to select the final pipeline. - 'auto' : Automatically determine the validation strategy based on the dataset shape. - - 'reshuffled' : Use the same data for cross validation and final validation, but with different splits for the folds. This is the default for small datasets. - - 'split' : Use a separate validation set for final validation. Data will be split according to validation_fraction. This is the default for medium datasets. + - 'reshuffled' : Use the same data for cross validation and final validation, but with different splits for the folds. This is the default for small datasets. + - 'split' : Use a separate validation set for final validation. Data will be split according to validation_fraction. This is the default for medium datasets. - 'none' : Do not use a separate validation set for final validation. Select based on the original cross-validation score. This is the default for large datasets. validation_fraction : float, default=0.2 EXPERIMENTAL The fraction of the dataset to use for the validation set when validation_strategy is 'split'. Must be between 0 and 1. - + disable_label_encoder : bool, default=False If True, TPOT will check if the target needs to be relabeled to be sequential ints from 0 to N. This is necessary for XGBoost compatibility. If the labels need to be encoded, TPOT2 will use sklearn.preprocessing.LabelEncoder to encode the labels. The encoder can be accessed via the self.label_encoder_ attribute. If False, no additional label encoders will be used. population_size : int, default=50 Size of the population - + initial_population_size : int, default=None Size of the initial population. If None, population_size will be used. - + population_scaling : int, default=0.5 Scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. - - generations_until_end_population : int, default=1 - Number of generations until the population size reaches population_size - + + generations_until_end_population : int, default=1 + Number of generations until the population size reaches population_size + generations : int, default=50 Number of generations to run - + early_stop : int, default=None Number of evaluated individuals without improvement before early stopping. Counted across all objectives independently. Triggered when all objectives have not improved by the given number of individuals. 
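With validation_strategy='split' (documented above), the hold-out split is now seeded from the estimator's random_state rather than a fixed value, so the validation set itself is reproducible across runs. A self-contained sketch of what that split amounts to, with toy data and illustrative settings:

    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=200, random_state=0)

    random_state = 42            # the estimator's random_state
    validation_fraction = 0.2    # the estimator's validation_fraction
    classification = True

    if classification:
        X, X_val, y, y_val = train_test_split(X, y, test_size=validation_fraction, stratify=y, random_state=random_state)
    else:
        X, X_val, y, y_val = train_test_split(X, y, test_size=validation_fraction, random_state=random_state)

    print(X.shape, X_val.shape)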
- + early_stop_seconds : float, default=None Number of seconds without improvement before early stopping. All objectives must not have improved for the given number of seconds for this to be triggered. - scorers_early_stop_tol : + scorers_early_stop_tol : -list of floats list of tolerances for each scorer. If the difference between the best score and the current score is less than the tolerance, the individual is considered to have converged If an index of the list is None, that item will not be used for early stopping - -int + -int If an int is given, it will be used as the tolerance for all objectives - - other_objectives_early_stop_tol : + + other_objectives_early_stop_tol : -list of floats list of tolerances for each of the other objective function. If the difference between the best score and the current score is less than the tolerance, the individual is considered to have converged If an index of the list is None, that item will not be used for early stopping - -int + -int If an int is given, it will be used as the tolerance for all objectives - + max_time_seconds : float, default=float("inf") Maximum time to run the optimization. If none or inf, will run until the end of the generations. - + max_eval_time_seconds : float, default=60*5 Maximum time to evaluate a single individual. If none or inf, there will be no time limit per evaluation. - + n_jobs : int, default=1 Number of processes to run in parallel. - + memory_limit : str, default="4GB" Memory limit for each job. See Dask [LocalCluster documentation](https://distributed.dask.org/en/stable/api.html#distributed.Client) for more information. - + client : dask.distributed.Client, default=None - A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. + A dask client to use for parallelization. If not None, this will override the n_jobs and memory_limit parameters. If None, will create a new client with num_workers=n_jobs and memory_limit=memory_limit. crossover_probability : float, default=.2 Probability of generating a new individual by crossover between two individuals. - + mutate_probability : float, default=.7 Probability of generating a new individual by crossover between one individuals. - + mutate_then_crossover_probability : float, default=.05 Probability of generating a new individual by mutating two individuals followed by crossover. - + crossover_then_mutate_probability : float, default=.05 Probability of generating a new individual by crossover between two individuals followed by a mutation of the resulting individual. - + survival_selector : function, default=survival_select_NSGA2 Function to use to select individuals for survival. Must take a matrix of scores and return selected indexes. Used to selected population_size individuals at the start of each generation to use for mutation and crossover. - + parent_selector : function, default=parent_select_NSGA2 Function to use to select pairs parents for crossover and individuals for mutation. Must take a matrix of scores and return selected indexes. - + budget_range : list [start, end], default=None A starting and ending budget to use for the budget scaling. - + budget_scaling float : [0,1], default=0.5 A scaling factor to use when determining how fast we move the budget from the start to end budget. - + individuals_until_end_budget : int, default=1 The number of generations to run before reaching the max budget. 
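The client parameter documented above accepts an existing Dask client, in which case it takes precedence over n_jobs and memory_limit. A sketch of wiring one up, assuming the class is importable as tpot2.TPOTEstimatorSteadyState; cluster sizes, scorers, and time limits are illustrative:

    from dask.distributed import Client, LocalCluster
    import tpot2

    cluster = LocalCluster(n_workers=4, threads_per_worker=1, memory_limit="4GB")
    client = Client(cluster)

    est = tpot2.TPOTEstimatorSteadyState(
        scorers=["roc_auc_ovr"],
        scorers_weights=[1],
        classification=True,
        client=client,           # overrides n_jobs and memory_limit
        max_time_seconds=300,
        random_state=42,
    )
    # ... est.fit(X, y) ...

    client.close()
    cluster.close()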
- + stepwise_steps : int, default=1 The number of staircase steps to take when scaling the budget and population size. - + threshold_evaluation_early_stop : list [start, end], default=None starting and ending percentile to use as a threshold for the evaluation early stopping. Values between 0 and 100. - + threshold_evaluation_scaling : float [0,inf), default=0.5 A scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. Must be greater than zero. Higher numbers will move the threshold to the end faster. - + min_history_threshold : int, default=0 The minimum number of previous scores needed before using threshold early stopping. - + selection_evaluation_early_stop : list, default=None A lower and upper percent of the population size to select each round of CV. Values between 0 and 1. - - selection_evaluation_scaling : float, default=0.5 + + selection_evaluation_scaling : float, default=0.5 A scaling factor to use when determining how fast we move the threshold moves from the start to end percentile. Must be greater than zero. Higher numbers will move the threshold to the end faster. - + n_initial_optimizations : int, default=0 Number of individuals to optimize before starting the evolution. - - optimization_cv : int + + optimization_cv : int Number of folds to use for the optuna optimization's internal cross-validation. - + max_optimize_time_seconds : float, default=60*5 Maximum time to run an optimization - + optimization_steps : int, default=10 Number of steps per optimization - + warm_start : bool, default=False If True, will use the continue the evolutionary algorithm from the last generation of the previous run. - + subset_column : str or int, default=None EXPERIMENTAL The column to use for the subset selection. Must also pass in unique_subset_values to GraphIndividual to function. - - verbose : int, default=1 + - verbose : int, default=1 How much information to print during the optimization process. Higher values include the information from lower values. 0. nothing 1. progress bar - + 3. best individual 4. warnings >=5. full warnings trace - - + + random_state : int, None, default=None + A seed for reproducibility of experiments. This value will be passed to numpy.random.default_rng() to create an instance of the generator to pass to other classes. + - int + Will be used to create and lock in Generator instance with 'numpy.random.default_rng()' + - None + Will be used to create Generator for 'numpy.random.default_rng()' where fresh, unpredictable entropy will be pulled from the OS + + periodic_checkpoint_folder : str, default=None Folder to save the population to periodically. If None, no periodic saving will be done. If provided, training will resume from this checkpoint. - + callback : tpot2.CallBackInterface, default=None Callback object. Not implemented processes : bool, default=True If True, will use multiprocessing to parallelize the optimization process. If False, will use threading. True seems to perform better. However, False is required for interactive debugging. - + Attributes ---------- fitted_pipeline_ : GraphPipeline A fitted instance of the GraphPipeline that inherits from sklearn BaseEstimator. This is fitted on the full X, y passed to fit. - evaluated_individuals : A pandas data frame containing data for all evaluated individuals in the run.
+ Columns: - *objective functions : The first few columns correspond to the passed in scorers and objective functions - Parents : A tuple containing the indexes of the pipelines used to generate the pipeline of that row. If NaN, this pipeline was generated randomly in the initial population. - Variation_Function : Which variation function was used to mutate or crossover the parents. If NaN, this pipeline was generated randomly in the initial population. - Individual : The internal representation of the individual that is used during the evolutionary algorithm. This is not an sklearn BaseEstimator. - - Generation : The generation the pipeline first appeared. - - Pareto_Front : The nondominated front that this pipeline belongs to. 0 means that its scores is not strictly dominated by any other individual. - To save on computational time, the best frontier is updated iteratively each generation. + - Generation : The generation the pipeline first appeared. + - Pareto_Front : The nondominated front that this pipeline belongs to. 0 means that its scores is not strictly dominated by any other individual. + To save on computational time, the best frontier is updated iteratively each generation. The pipelines with the 0th pareto front do represent the exact best frontier. However, the pipelines with pareto front >= 1 are only in reference to the other pipelines in the final population. - All other pipelines are set to NaN. - - Instance : The unfitted GraphPipeline BaseEstimator. + All other pipelines are set to NaN. + - Instance : The unfitted GraphPipeline BaseEstimator. - *validation objective functions : Objective function scores evaluated on the validation set. - Validation_Pareto_Front : The full pareto front calculated on the validation set. This is calculated for all pipelines with Pareto_Front equal to 0. Unlike the Pareto_Front which only calculates the frontier and the final population, the Validation Pareto Front is calculated for all pipelines tested on the validation set. - + pareto_front : The same pandas dataframe as evaluated individuals, but containing only the frontier pareto front pipelines. 
''' @@ -440,7 +450,7 @@ def __init__(self, scorers= [], self.early_stop_seconds = early_stop_seconds self.scorers_early_stop_tol = scorers_early_stop_tol self.other_objectives_early_stop_tol = other_objectives_early_stop_tol - self.max_time_seconds = max_time_seconds + self.max_time_seconds = max_time_seconds self.max_eval_time_seconds = max_eval_time_seconds self.n_jobs= n_jobs self.memory_limit = memory_limit @@ -473,6 +483,13 @@ def __init__(self, scorers= [], self.optuna_optimize_pareto_front_timeout = optuna_optimize_pareto_front_timeout self.optuna_storage = optuna_storage + # create random number generator based on rng_seed + self.rng = np.random.default_rng(random_state) + # save random state passed to us for other functions that use random_state + self.random_state = random_state + # set the numpy seed so anything using it will be consistent as well + np.random.seed(random_state) + self.max_evaluated_individuals = max_evaluated_individuals @@ -491,24 +508,24 @@ def __init__(self, scorers= [], self._scorers = [self.scorers] else: self._scorers = self.scorers - + self._scorers = [sklearn.metrics.get_scorer(scoring) for scoring in self._scorers] self._scorers_early_stop_tol = self.scorers_early_stop_tol - + self._evolver = tpot2.evolvers.SteadyStateEvolver - - + + self.objective_function_weights = [*scorers_weights, *other_objective_functions_weights] - + if self.objective_function_names is None: obj_names = [f.__name__ for f in other_objective_functions] else: obj_names = self.objective_function_names self.objective_names = [f._score_func.__name__ if hasattr(f,"_score_func") else f.__name__ for f in self._scorers] + obj_names - - + + if not isinstance(self.other_objectives_early_stop_tol, list): self._other_objectives_early_stop_tol = [self.other_objectives_early_stop_tol for _ in range(len(self.other_objective_functions))] else: @@ -520,7 +537,7 @@ def __init__(self, scorers= [], self._scorers_early_stop_tol = self._scorers_early_stop_tol self.early_stop_tol = [*self._scorers_early_stop_tol, *self._other_objectives_early_stop_tol] - + self._evolver_instance = None self.evaluated_individuals = None @@ -550,8 +567,8 @@ def fit(self, X, y): if self.classification and not self.disable_label_encoder and not check_if_y_is_encoded(y): warnings.warn("Labels are not encoded as ints from 0 to N. For compatibility with some classifiers such as sklearn, TPOT has encoded y with the sklearn LabelEncoder. 
When using pipelines outside the main TPOT estimator class, you can encode the labels with est.label_encoder_") - self.label_encoder_ = LabelEncoder() - y = self.label_encoder_.fit_transform(y) + self.label_encoder_ = LabelEncoder() + y = self.label_encoder_.fit_transform(y) self.evaluated_individuals = None #determine validation strategy @@ -570,9 +587,9 @@ def fit(self, X, y): if validation_strategy == 'split': if self.classification: - X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, stratify=y, random_state=42) + X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, stratify=y, random_state=self.random_state) else: - X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, random_state=42) + X, X_val, y, y_val = train_test_split(X, y, test_size=self.validation_fraction, random_state=self.random_state) X_original = X @@ -584,7 +601,7 @@ def fit(self, X, y): if self.classification: X, y = remove_underrepresented_classes(X, y, n_folds) - + if self.preprocessing: #X = pd.DataFrame(X) @@ -602,7 +619,7 @@ def fit(self, X, y): tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'), #impute remaining numeric columns tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001)) #one hot encode categorical columns else: #numpy array and no categorical columns specified, just do imputation - self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')) + self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')) else: @@ -612,6 +629,15 @@ def fit(self, X, y): #Set up the configuation dictionaries and the search spaces + #check if self.cv is a number + if isinstance(self.cv, int) or isinstance(self.cv, float): + if self.classification: + self.cv_gen = sklearn.model_selection.StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=self.random_state) + else: + self.cv_gen = sklearn.model_selection.KFold(n_splits=self.cv, shuffle=True, random_state=self.random_state) + + else: + self.cv_gen = sklearn.model_selection.check_cv(self.cv, y, classifier=self.classification) n_samples= int(math.floor(X.shape[0]/n_folds)) @@ -625,54 +651,43 @@ def fit(self, X, y): if self.root_config_dict == 'Auto': if self.classification: n_classes = len(np.unique(y)) - root_config_dict = get_configuration_dictionary("classifiers", n_samples, n_features, self.classification, subsets=self.subsets, feature_names=self.feature_names, n_classes=n_classes) + root_config_dict = get_configuration_dictionary("classifiers", n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names, n_classes=n_classes) else: - root_config_dict = get_configuration_dictionary("regressors", n_samples, n_features, self.classification,subsets=self.subsets, feature_names=self.feature_names) + root_config_dict = get_configuration_dictionary("regressors", n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) else: - root_config_dict = get_configuration_dictionary(self.root_config_dict, n_samples, n_features, self.classification, subsets=self.subsets,feature_names=self.feature_names) - - inner_config_dict = get_configuration_dictionary(self.inner_config_dict, n_samples, n_features, self.classification,subsets=self.subsets, 
feature_names=self.feature_names) - leaf_config_dict = get_configuration_dictionary(self.leaf_config_dict, n_samples, n_features, self.classification, subsets=self.subsets, feature_names=self.feature_names) + root_config_dict = get_configuration_dictionary(self.root_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets,feature_names=self.feature_names) + inner_config_dict = get_configuration_dictionary(self.inner_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) + leaf_config_dict = get_configuration_dictionary(self.leaf_config_dict, n_samples, n_features, self.classification, self.random_state, self.cv_gen, subsets=self.subsets, feature_names=self.feature_names) - #check if self.cv is a number - if isinstance(self.cv, int) or isinstance(self.cv, float): - if self.classification: - self.cv_gen = sklearn.model_selection.StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=42) - else: - self.cv_gen = sklearn.model_selection.KFold(n_splits=self.cv, shuffle=True, random_state=42) - - else: - self.cv_gen = sklearn.model_selection.check_cv(self.cv, y, classifier=self.classification) - - def objective_function(pipeline_individual, - X, + def objective_function(pipeline_individual, + X, y, is_classification=self.classification, - scorers= self._scorers, - cv=self.cv_gen, + scorers= self._scorers, + cv=self.cv_gen, other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, - **kwargs): + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, + **kwargs): return objective_function_generator( pipeline_individual, - X, - y, + X, + y, is_classification=is_classification, - scorers= scorers, - cv=cv, + scorers= scorers, + cv=cv, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, ) - self.individual_generator_instance = tpot2.individual_representations.graph_pipeline_individual.estimator_graph_individual_generator( + self.individual_generator_instance = tpot2.individual_representations.graph_pipeline_individual.estimator_graph_individual_generator( inner_config_dict=inner_config_dict, root_config_dict=root_config_dict, leaf_config_dict=leaf_config_dict, @@ -691,7 +706,7 @@ def objective_function(pipeline_individual, #If warm start and we have an evolver instance, use the existing one if not(self.warm_start and self._evolver_instance is not None): - self._evolver_instance = self._evolver( individual_generator=self.individual_generator_instance, + self._evolver_instance = self._evolver( individual_generator=self.individual_generator_instance, objective_functions= [objective_function], objective_function_weights = self.objective_function_weights, objective_names=self.objective_names, @@ -703,8 +718,8 @@ def objective_function(pipeline_individual, verbose = self.verbose, max_time_seconds = self.max_time_seconds , max_eval_time_seconds = self.max_eval_time_seconds, - - + + periodic_checkpoint_folder = self.periodic_checkpoint_folder, @@ -712,7 +727,7 @@ def objective_function(pipeline_individual, early_stop_tol = self.early_stop_tol, early_stop= self.early_stop, early_stop_seconds = self.early_stop_seconds, - + budget_range = 
self.budget_range, budget_scaling = self.budget_scaling, individuals_until_end_budget = self.individuals_until_end_budget, @@ -728,12 +743,14 @@ def objective_function(pipeline_individual, mutate_probability = self.mutate_probability, mutate_then_crossover_probability= self.mutate_then_crossover_probability, crossover_then_mutate_probability= self.crossover_then_mutate_probability, - - max_evaluated_individuals = self.max_evaluated_individuals + + max_evaluated_individuals = self.max_evaluated_individuals, + + rng_=self.rng, ) - + self._evolver_instance.optimize() #self._evolver_instance.population.update_pareto_fronts(self.objective_names, self.objective_function_weights) self.make_evaluated_individuals() @@ -743,24 +760,24 @@ def objective_function(pipeline_individual, pareto_front_inds = self.pareto_front['Individual'].values all_graphs, all_scores = tpot2.individual_representations.graph_pipeline_individual.simple_parallel_optuna(pareto_front_inds, objective_function, self.objective_function_weights, _client, storage=self.optuna_storage, steps=self.optuna_optimize_pareto_front_trials, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, max_time_seconds=self.optuna_optimize_pareto_front_timeout, **{"X": X, "y": y}) all_scores = tpot2.utils.eval_utils.process_scores(all_scores, len(self.objective_function_weights)) - + if len(all_graphs) > 0: df = pd.DataFrame(np.column_stack((all_graphs, all_scores,np.repeat("Optuna",len(all_graphs)))), columns=["Individual"] + self.objective_names +["Parents"]) for obj in self.objective_names: df[obj] = df[obj].apply(convert_to_float) - + self.evaluated_individuals = pd.concat([self.evaluated_individuals, df], ignore_index=True) else: print("WARNING NO OPTUNA TRIALS COMPLETED") - + tpot2.utils.get_pareto_frontier(self.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"]) if validation_strategy == 'reshuffled': best_pareto_front_idx = list(self.pareto_front.index) best_pareto_front = list(self.pareto_front.loc[best_pareto_front_idx]['Individual']) - + #reshuffle rows - X, y = sklearn.utils.shuffle(X, y, random_state=1) + X, y = sklearn.utils.shuffle(X, y, random_state=self.random_state) if self.scatter: X_future = _client.scatter(X) @@ -769,30 +786,30 @@ def objective_function(pipeline_individual, X_future = X y_future = y - val_objective_function_list = [lambda ind, - X, - y, + val_objective_function_list = [lambda ind, + X, + y, is_classification=self.classification, - scorers= self._scorers, - cv=self.cv_gen, - other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, + scorers= self._scorers, + cv=self.cv_gen, + other_objective_functions=self.other_objective_functions, + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, **kwargs: objective_function_generator( ind, X, - y, + y, is_classification=is_classification, - scorers= scorers, - cv=cv, + scorers= scorers, + cv=cv, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, )] - + objective_kwargs = {"X": X_future, "y": y_future} val_scores = tpot2.utils.eval_utils.parallel_eval_objective_list( best_pareto_front, @@ -807,7 +824,7 @@ def objective_function(pipeline_individual, 
elif validation_strategy == 'split': - if self.scatter: + if self.scatter: X_future = _client.scatter(X) y_future = _client.scatter(y) X_val_future = _client.scatter(X_val) @@ -819,33 +836,33 @@ def objective_function(pipeline_individual, y_val_future = y_val objective_kwargs = {"X": X_future, "y": y_future, "X_val" : X_val_future, "y_val":y_val_future } - + best_pareto_front_idx = list(self.pareto_front.index) best_pareto_front = list(self.pareto_front.loc[best_pareto_front_idx]['Individual']) - val_objective_function_list = [lambda ind, - X, - y, - X_val, - y_val, - scorers= self._scorers, - other_objective_functions=self.other_objective_functions, - memory=self.memory, - cross_val_predict_cv=self.cross_val_predict_cv, - subset_column=self.subset_column, + val_objective_function_list = [lambda ind, + X, + y, + X_val, + y_val, + scorers= self._scorers, + other_objective_functions=self.other_objective_functions, + memory=self.memory, + cross_val_predict_cv=self.cross_val_predict_cv, + subset_column=self.subset_column, **kwargs: val_objective_function_generator( ind, X, y, - X_val, - y_val, - scorers= scorers, + X_val, + y_val, + scorers= scorers, other_objective_functions=other_objective_functions, - memory=memory, - cross_val_predict_cv=cross_val_predict_cv, + memory=memory, + cross_val_predict_cv=cross_val_predict_cv, subset_column=subset_column, **kwargs, )] - + val_scores = tpot2.utils.eval_utils.parallel_eval_objective_list( best_pareto_front, val_objective_function_list, n_jobs=self.n_jobs, verbose=self.verbose, timeout=self.max_eval_time_seconds,n_expected_columns=len(self.objective_names),client=_client, **objective_kwargs) @@ -857,25 +874,25 @@ def objective_function(pipeline_individual, else: self.objective_names_for_selection = self.objective_names - val_scores = self.evaluated_individuals[~self.evaluated_individuals[self.objective_names_for_selection].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names_for_selection].astype(float) + val_scores = self.evaluated_individuals[~self.evaluated_individuals[self.objective_names_for_selection].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names_for_selection].astype(float) weighted_scores = val_scores*self.objective_function_weights - + if self.bigger_is_better: best_idx = weighted_scores[self.objective_names_for_selection[0]].idxmax() else: best_idx = weighted_scores[self.objective_names_for_selection[0]].idxmin() - + best_individual = self.evaluated_individuals.loc[best_idx]['Individual'] self.selected_best_score = self.evaluated_individuals.loc[best_idx] - + best_individual_pipeline = best_individual.export_pipeline(memory=self.memory, cross_val_predict_cv=self.cross_val_predict_cv, subset_column=self.subset_column) if self.preprocessing: self.fitted_pipeline_ = sklearn.pipeline.make_pipeline(sklearn.base.clone(self._preprocessing_pipeline), best_individual_pipeline ) else: - self.fitted_pipeline_ = best_individual_pipeline - + self.fitted_pipeline_ = best_individual_pipeline + self.fitted_pipeline_.fit(X_original,y_original) #TODO use y_original as well? @@ -885,7 +902,7 @@ def objective_function(pipeline_individual, cluster.close() return self - + def _estimator_has(attr): '''Check if we can delegate a method to the underlying estimator. 
First, we check the first fitted final estimator if available, otherwise we @@ -897,7 +914,7 @@ def _estimator_has(attr): - + @available_if(_estimator_has('predict')) @@ -907,21 +924,21 @@ def predict(self, X, **predict_params): preds = self.fitted_pipeline_.predict(X,**predict_params) if self.classification and self.label_encoder_: preds = self.label_encoder_.inverse_transform(preds) - + return preds - + @available_if(_estimator_has('predict_proba')) def predict_proba(self, X, **predict_params): check_is_fitted(self) #X = check_array(X) return self.fitted_pipeline_.predict_proba(X,**predict_params) - + @available_if(_estimator_has('decision_function')) def decision_function(self, X, **predict_params): check_is_fitted(self) #X = check_array(X) return self.fitted_pipeline_.decision_function(X,**predict_params) - + @available_if(_estimator_has('transform')) def transform(self, X, **predict_params): check_is_fitted(self) @@ -931,7 +948,7 @@ def transform(self, X, **predict_params): @property def classes_(self): """The classes labels. Only exist if the last step is a classifier.""" - + if self.label_encoder_: return self.label_encoder_.classes_ else: @@ -953,7 +970,7 @@ def make_evaluated_individuals(self): self.evaluated_individuals["Instance"] = self.evaluated_individuals["Individual"].apply(lambda ind: apply_make_pipeline(ind, preprocessing_pipeline=self._preprocessing_pipeline)) return self.evaluated_individuals - + @property def pareto_front(self): #check if _evolver_instance exists @@ -964,5 +981,3 @@ def pareto_front(self): return self.evaluated_individuals else: return self.evaluated_individuals[self.evaluated_individuals["Pareto_Front"]==1] - -
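Taken together, the intent of this change set is that one random_state seeds everything a run depends on: the CV splitter, the validation split and reshuffle, the hyperparameter spaces built by the config dictionaries, and the rng_ handed to the evolver. An end-to-end sketch of checking that, assuming the class is importable as tpot2.TPOTEstimatorSteadyState; the dataset, scorer, and budget are illustrative, and time-based limits can still introduce run-to-run variation:

    import sklearn.datasets
    import tpot2

    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)

    def best_score(seed):
        est = tpot2.TPOTEstimatorSteadyState(
            scorers=["roc_auc"],
            scorers_weights=[1],
            classification=True,
            cv=5,
            n_jobs=1,
            max_eval_time_seconds=30,
            max_evaluated_individuals=50,   # small budget, illustration only
            verbose=1,
            random_state=seed,
        )
        est.fit(X, y)
        # evaluated_individuals / pareto_front are the dataframes described in the docstring above
        return est.pareto_front[est.objective_names[0]].astype(float).max()

    print(best_score(42), best_score(42))   # expected to agree for a fixed seed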