From 9b987e0f347b54a05308096780a38d4dba4e888d Mon Sep 17 00:00:00 2001 From: perib Date: Wed, 24 Apr 2024 17:10:35 -0700 Subject: [PATCH 1/7] edit default space --- tpot2/tpot_estimator/templates/tpottemplates.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tpot2/tpot_estimator/templates/tpottemplates.py b/tpot2/tpot_estimator/templates/tpottemplates.py index 1982db78..4b0e90ad 100644 --- a/tpot2/tpot_estimator/templates/tpottemplates.py +++ b/tpot2/tpot_estimator/templates/tpottemplates.py @@ -77,13 +77,13 @@ def fit(self, X, y): search_space = tpot2.search_spaces.pipelines.GraphPipeline( root_search_space= tpot2.config.get_search_space("regressors", **get_search_space_params), leaf_search_space = None, - inner_search_space = tpot2.config.get_search_space(["selectors","transformers","regressors","scalers"],**get_search_space_params), + inner_search_space = tpot2.config.get_search_space(["selectors","transformers","regressors","scalers","selectors_regression"],**get_search_space_params), ) else: search_space = tpot2.search_spaces.pipelines.GraphPipeline( root_search_space= tpot2.config.get_search_space("regressors", **get_search_space_params), leaf_search_space = None, - inner_search_space = tpot2.config.get_search_space(["selectors","transformers","scalers"],**get_search_space_params), + inner_search_space = tpot2.config.get_search_space(["selectors","transformers","scalers","selectors_regression"],**get_search_space_params), ) super(TPOTRegressor,self).__init__( @@ -189,13 +189,13 @@ def fit(self, X, y): search_space = tpot2.search_spaces.pipelines.GraphPipeline( root_search_space= tpot2.config.get_search_space("classifiers", **get_search_space_params), leaf_search_space = None, - inner_search_space = tpot2.config.get_search_space(["selectors","transformers","classifiers", "scalers"], **get_search_space_params), + inner_search_space = tpot2.config.get_search_space(["selectors","transformers","classifiers", 
"scalers","selectors_classification"], **get_search_space_params), ) else: search_space = tpot2.search_spaces.pipelines.GraphPipeline( root_search_space= tpot2.config.get_search_space("classifiers", **get_search_space_params), leaf_search_space = None, - inner_search_space = tpot2.config.get_search_space(["selectors","transformers","scalers"], **get_search_space_params), + inner_search_space = tpot2.config.get_search_space(["selectors","transformers","scalers","selectors_classification"], **get_search_space_params), ) From dbba6ee9114d70ec2154fb19ac4f1fa5d41eb1a3 Mon Sep 17 00:00:00 2001 From: perib Date: Fri, 3 May 2024 16:14:47 -0700 Subject: [PATCH 2/7] bug fixes, fixed unique_id type (non-iterable tuples), fixed graphpipeline mutation function --- tpot2/config/classifiers.py | 2 +- tpot2/config/get_configspace.py | 6 +- tpot2/config/regressors.py | 4 +- tpot2/config/tests/test_get_configspace.py | 4 +- tpot2/search_spaces/base.py | 21 +++ tpot2/search_spaces/nodes/estimator_node.py | 8 +- .../nodes/estimator_node_custom_sampler.py | 8 +- .../nodes/estimator_node_simple.py | 8 +- tpot2/search_spaces/nodes/fss_node.py | 3 +- .../nodes/genetic_feature_selection.py | 4 +- .../search_spaces/pipelines/dynamic_linear.py | 5 +- tpot2/search_spaces/pipelines/dynamicunion.py | 157 ++++++++++++++++++ tpot2/search_spaces/pipelines/graph.py | 79 +++++---- tpot2/search_spaces/pipelines/sequential.py | 71 +++++++- tpot2/search_spaces/pipelines/tree.py | 1 + tpot2/search_spaces/pipelines/union.py | 90 ++++++++++ tpot2/search_spaces/pipelines/wrapper.py | 17 +- tpot2/search_spaces/tuple_index.py | 25 +++ 18 files changed, 454 insertions(+), 59 deletions(-) create mode 100644 tpot2/search_spaces/pipelines/dynamicunion.py create mode 100644 tpot2/search_spaces/pipelines/union.py create mode 100644 tpot2/search_spaces/tuple_index.py diff --git a/tpot2/config/classifiers.py b/tpot2/config/classifiers.py index 4b9b8581..cad90a90 100644 --- a/tpot2/config/classifiers.py +++ 
b/tpot2/config/classifiers.py @@ -445,7 +445,7 @@ def HistGradientBoostingClassifier_hyperparameter_parser(params): if params['early_stop'] == 'off': - final_params['n_iter_no_change'] = 0 + # final_params['n_iter_no_change'] = 0 final_params['validation_fraction'] = None final_params['early_stopping'] = False elif params['early_stop'] == 'valid': diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index c1762cfe..05a76920 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -27,6 +27,7 @@ from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer from tpot2.builtin_modules.genetic_encoders import DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder +from tpot2.builtin_modules import Passthrough from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, ElasticNetCV, PassiveAggressiveClassifier, ARDRegression from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor from sklearn.neural_network import MLPClassifier, MLPRegressor @@ -53,6 +54,7 @@ PowerTransformer, QuantileTransformer,ARDRegression, QuadraticDiscriminantAnalysis, PassiveAggressiveClassifier, LinearDiscriminantAnalysis, DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder, 
GaussianProcessClassifier, BaggingClassifier,LGBMRegressor, + Passthrough, ] @@ -147,6 +149,8 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta case "OverDominanceEncoder": return {} + case "Passthrough": + return {} #classifiers.py case "LinearDiscriminantAnalysis": @@ -335,7 +339,7 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta space = { - 'n': Float("n", bounds=(-1e3, 1e3)), + 'n': Float("n", bounds=(-1e2, 1e2)), } ) diff --git a/tpot2/config/regressors.py b/tpot2/config/regressors.py index 3dd1598a..e3b9b1d4 100644 --- a/tpot2/config/regressors.py +++ b/tpot2/config/regressors.py @@ -491,8 +491,8 @@ def HistGradientBoostingRegressor_hyperparameter_parser(params): if params['early_stop'] == 'off': - final_params['n_iter_no_change'] = 0 - final_params['validation_fraction'] = None + # final_params['n_iter_no_change'] = 0 + # final_params['validation_fraction'] = None final_params['early_stopping'] = False elif params['early_stop'] == 'valid': final_params['n_iter_no_change'] = params['n_iter_no_change'] diff --git a/tpot2/config/tests/test_get_configspace.py b/tpot2/config/tests/test_get_configspace.py index 15312fa8..ae9af09a 100644 --- a/tpot2/config/tests/test_get_configspace.py +++ b/tpot2/config/tests/test_get_configspace.py @@ -20,7 +20,7 @@ def test_loop_through_all_hyperparameters(): estnode_gen = tpot2.config.get_search_space(class_name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state) #generate 100 random hyperparameters and make sure they are all valid - for i in range(100): + for i in range(25): estnode = estnode_gen.generate() est = estnode.export_pipeline() @@ -37,6 +37,6 @@ def test_loop_through_groupnames(): estnode_gen = tpot2.config.get_search_space(class_name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state) #generate 10 random hyperparameters and make sure they are all valid - for i in 
range(100): + for i in range(25): estnode = estnode_gen.generate() est = estnode.export_pipeline() \ No newline at end of file diff --git a/tpot2/search_spaces/base.py b/tpot2/search_spaces/base.py index f3cabe76..9e453640 100644 --- a/tpot2/search_spaces/base.py +++ b/tpot2/search_spaces/base.py @@ -26,8 +26,29 @@ def export_pipeline(self) -> BaseEstimator: return def unique_id(self): + """ + Returns a unique identifier for the individual. Used for preventing duplicate individuals from being evaluated. + """ return self + #TODO currently TPOT2 population class manually uses the unique_id to generate the index for the population data frame. + #alternatively, the index could be the individual itself, with the __eq__ and __hash__ methods implemented. + + # Though this breaks the graphpipeline. When a mutation is called, it changes the __eq__ and __hash__ outputs. + # Since networkx uses the hash and eq to determine if a node is already in the graph, this causes the graph to think that + # This is a new node not in the graph. But this could be changed if when the graphpipeline mutates nodes, + # it "replaces" the existing node with the mutated node. This would require a change in the graphpipeline class. 
+ + # def __eq__(self, other): + # return self.unique_id() == other.unique_id() + + # def __hash__(self): + # return hash(self.unique_id()) + + #number of components in the pipeline + def get_size(self): + return 1 + @final def export_flattened_graphpipeline(self, **graphpipeline_kwargs) -> tpot2.GraphPipeline: return flatten_to_graphpipeline(self.export_pipeline(), **graphpipeline_kwargs) diff --git a/tpot2/search_spaces/nodes/estimator_node.py b/tpot2/search_spaces/nodes/estimator_node.py index 8a35a17d..50d698f3 100644 --- a/tpot2/search_spaces/nodes/estimator_node.py +++ b/tpot2/search_spaces/nodes/estimator_node.py @@ -95,7 +95,13 @@ def export_pipeline(self, **kwargs): def unique_id(self): #return a dictionary of the method and the hyperparameters - return (self.method, str(tuple(sorted(list(self.hyperparameter_parser(self.hyperparameters).items()))))) + method_str = self.method.__name__ + params = list(self.hyperparameters.keys()) + params = sorted(params) + + id_str = f"{method_str}({', '.join([f'{param}={self.hyperparameters[param]}' for param in params])})" + + return id_str class EstimatorNode(SklearnIndividualGenerator): def __init__(self, method, space, hyperparameter_parser=default_hyperparameter_parser): diff --git a/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py b/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py index c53d4715..4ec76ef9 100644 --- a/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py +++ b/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py @@ -40,7 +40,13 @@ def export_pipeline(self, **kwargs): def unique_id(self): #return a dictionary of the method and the hyperparameters - return (self.method, self.hyperparameters) + method_str = self.method.__name__ + params = list(self.hyperparameters.keys()) + params = sorted(params) + + id_str = f"{method_str}({', '.join([f'{param}={self.hyperparameters[param]}' for param in params])})" + + return id_str class 
EstimatorNodeCustom(SklearnIndividualGenerator): def __init__(self, method : type, diff --git a/tpot2/search_spaces/nodes/estimator_node_simple.py b/tpot2/search_spaces/nodes/estimator_node_simple.py index eb658cab..934c899e 100644 --- a/tpot2/search_spaces/nodes/estimator_node_simple.py +++ b/tpot2/search_spaces/nodes/estimator_node_simple.py @@ -53,7 +53,13 @@ def export_pipeline(self, **kwargs): def unique_id(self): #return a dictionary of the method and the hyperparameters - return (self.method, self.hyperparameters) + method_str = self.method.__name__ + params = list(self.hyperparameters.keys()) + params = sorted(params) + + id_str = f"{method_str}({', '.join([f'{param}={self.hyperparameters[param]}' for param in params])})" + + return id_str class EstimatorNode(SklearnIndividualGenerator): def __init__(self, method, space): diff --git a/tpot2/search_spaces/nodes/fss_node.py b/tpot2/search_spaces/nodes/fss_node.py index b87275ad..4dda0d92 100644 --- a/tpot2/search_spaces/nodes/fss_node.py +++ b/tpot2/search_spaces/nodes/fss_node.py @@ -60,7 +60,8 @@ def export_pipeline(self): def unique_id(self): - return ("FSS", self.selected_subset_name) + id_str = "FeatureSetSelector({0})".format(self.selected_subset_name) + return id_str class FSSNode(SklearnIndividualGenerator): diff --git a/tpot2/search_spaces/nodes/genetic_feature_selection.py b/tpot2/search_spaces/nodes/genetic_feature_selection.py index ac604309..2f55c7d5 100644 --- a/tpot2/search_spaces/nodes/genetic_feature_selection.py +++ b/tpot2/search_spaces/nodes/genetic_feature_selection.py @@ -146,7 +146,9 @@ def export_pipeline(self): def unique_id(self): - return tuple(self.mask) + mask_idexes = np.where(self.mask)[0] + id_str = ','.join([str(i) for i in mask_idexes]) + return id_str class GeneticFeatureSelectorNode(SklearnIndividualGenerator): diff --git a/tpot2/search_spaces/pipelines/dynamic_linear.py b/tpot2/search_spaces/pipelines/dynamic_linear.py index 7408fe8b..20c1ea37 100644 --- 
a/tpot2/search_spaces/pipelines/dynamic_linear.py +++ b/tpot2/search_spaces/pipelines/dynamic_linear.py @@ -8,6 +8,7 @@ from ..base import SklearnIndividual, SklearnIndividualGenerator import copy +from ..tuple_index import TupleIndex class DynamicLinearPipelineIndividual(SklearnIndividual): # takes in a single search space. @@ -78,7 +79,9 @@ def export_pipeline(self, **graph_pipeline_args): return [step.export_pipeline(**graph_pipeline_args) for step in self.pipeline] def unique_id(self): - return tuple([step.unique_id() for step in self.pipeline]) + l = [step.unique_id() for step in self.pipeline] + l = ["DynamicLinearPipeline"] + l + return TupleIndex(tuple(l)) class DynamicLinearPipeline(SklearnIndividualGenerator): diff --git a/tpot2/search_spaces/pipelines/dynamicunion.py b/tpot2/search_spaces/pipelines/dynamicunion.py new file mode 100644 index 00000000..5fa0540d --- /dev/null +++ b/tpot2/search_spaces/pipelines/dynamicunion.py @@ -0,0 +1,157 @@ +import tpot2 +import numpy as np +import pandas as pd +import sklearn +from tpot2 import config +from typing import Generator, List, Tuple, Union +import random +from ..base import SklearnIndividual, SklearnIndividualGenerator +from ..tuple_index import TupleIndex + +class DynamicUnionPipelineIndividual(SklearnIndividual): + """ + Takes in one search space. + Will produce a FeatureUnion of up to max_estimators number of steps. + The output of the FeatureUnion will be all of the steps concatenated together. 
+ + """ + + def __init__(self, search_space : SklearnIndividualGenerator, max_estimators=None, rng=None) -> None: + super().__init__() + self.search_space = search_space + + if max_estimators is None: + self.max_estimators = np.inf + else: + self.max_estimators = max_estimators + + self.pipeline = [] + + if self.max_estimators == np.inf: + init_max = 3 + else: + init_max = self.max_estimators + + rng = np.random.default_rng(rng) + + for _ in range(rng.integers(1, init_max)): + self.pipeline.append(self.search_space.generate(rng)) + + def mutate(self, rng=None): + rng = np.random.default_rng() + mutation_funcs = [self._mutate_add_step, self._mutate_remove_step, self._mutate_replace_step, self._mutate_inner_step] + rng.shuffle(mutation_funcs) + for mutation_func in mutation_funcs: + if mutation_func(rng): + return True + + def _mutate_add_step(self, rng): + rng = np.random.default_rng() + if len(self.pipeline) < self.max_estimators: + self.pipeline.append(self.search_space.generate(rng)) + return True + return False + + def _mutate_remove_step(self, rng): + rng = np.random.default_rng() + if len(self.pipeline) > 1: + self.pipeline.pop(rng.integers(0, len(self.pipeline))) + return True + return False + + def _mutate_replace_step(self, rng): + rng = np.random.default_rng() + idx = rng.integers(0, len(self.pipeline)) + self.pipeline[idx] = self.search_space.generate(rng) + return True + + def _mutate_inner_step(self, rng): + rng = np.random.default_rng() + indexes = rng.random(len(self.pipeline)) < 0.5 + indexes = np.where(indexes)[0] + mutated = False + if len(indexes) > 0: + for idx in indexes: + if self.pipeline[idx].mutate(rng): + mutated = True + else: + mutated = self.pipeline[rng.integers(0, len(self.pipeline))].mutate(rng) + + return mutated + + + def crossover(self, other, rng=None): + rng = np.random.default_rng() + + cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step] + rng.shuffle(cx_funcs) + for cx_func in cx_funcs: + if 
cx_func(other, rng): + return True + + return False + + def _crossover_swap_step(self, other, rng): + rng = np.random.default_rng() + idx = rng.integers(1,len(self.pipeline)) + idx2 = rng.integers(1,len(other.pipeline)) + + self.pipeline[idx], other.pipeline[idx2] = other.pipeline[idx2], self.pipeline[idx] + # self.pipeline[idx] = other.pipeline[idx2] + return True + + def _crossover_swap_random_steps(self, other, rng): + rng = np.random.default_rng() + + max_steps = int(min(len(self.pipeline), len(other.pipeline))/2) + max_steps = max(max_steps, 1) + + n_steps_to_swap = rng.integers(1, max_steps) + + other_indexes_to_take = rng.choice(len(other.pipeline), n_steps_to_swap, replace=False) + self_indexes_to_replace = rng.choice(len(self.pipeline), n_steps_to_swap, replace=False) + + self.pipeline[self_indexes_to_replace], other.pipeline[other_indexes_to_take] = other.pipeline[other_indexes_to_take], self.pipeline[self_indexes_to_replace] + return True + + + + def _crossover_inner_step(self, other, rng): + rng = np.random.default_rng() + + #randomly select pairs of steps to crossover + indexes = list(range(1, len(self.pipeline))) + other_indexes = list(range(1, len(other.pipeline))) + #shuffle + rng.shuffle(indexes) + rng.shuffle(other_indexes) + + crossover_success = False + for idx, other_idx in zip(indexes, other_indexes): + if self.pipeline[idx].crossover(other.pipeline[other_idx], rng): + crossover_success = True + + return crossover_success + + def export_pipeline(self): + return sklearn.pipeline.make_pipeline(*[step.export_pipeline() for step in self.pipeline]) + + def unique_id(self): + l = [step.unique_id() for step in self.pipeline] + # if all items are strings, then sort them + if all([isinstance(x, str) for x in l]): + l.sort() + l = ["FeatureUnion"] + l + return TupleIndex(tuple(l)) + + +class DynamicUnionPipeline(SklearnIndividualGenerator): + def __init__(self, search_spaces : List[SklearnIndividualGenerator] ) -> None: + """ + Takes in a list of search 
spaces. will produce a pipeline of Sequential length. Each step in the pipeline will correspond to the the search space provided in the same index. + """ + + self.search_spaces = search_spaces + + def generate(self, rng=None): + return DynamicUnionPipelineIndividual(self.search_spaces) \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/graph.py b/tpot2/search_spaces/pipelines/graph.py index dcd99511..36db8082 100644 --- a/tpot2/search_spaces/pipelines/graph.py +++ b/tpot2/search_spaces/pipelines/graph.py @@ -11,6 +11,7 @@ from typing import Union, Callable import sklearn from functools import partial +import random class GraphPipelineIndividual(SklearnIndividual): """ @@ -96,8 +97,13 @@ def __init__( self.graph.add_node(self.leaf) self.graph.add_edge(self.root, self.leaf) - self.mutate_methods_list = [self._mutate_insert_leaf, self._mutate_insert_inner_node, self._mutate_remove_node, self._mutate_node] - self.crossover_methods_list = [self._crossover_swap_branch,]#[self._crossover_swap_branch, self._crossover_swap_node, self._crossover_take_branch] #TODO self._crossover_nodes, + if self.inner_search_space is None and self.leaf_search_space is None: + self.mutate_methods_list = [self._mutate_node] + self.crossover_methods_list = [self._crossover_swap_branch,]#[self._crossover_swap_branch, self._crossover_swap_node, self._crossover_take_branch] #TODO self._crossover_nodes, + + else: + self.mutate_methods_list = [self._mutate_insert_leaf, self._mutate_insert_inner_node, self._mutate_remove_node, self._mutate_node] + self.crossover_methods_list = [self._crossover_swap_branch,]#[self._crossover_swap_branch, self._crossover_swap_node, self._crossover_take_branch] #TODO self._crossover_nodes, self.merge_duplicated_nodes_toggle = True @@ -106,34 +112,34 @@ def __init__( def mutate(self, rng=None): rng = np.random.default_rng(rng) - rng.shuffle(self.mutate_methods_list) - for mutate_method in self.mutate_methods_list: - if mutate_method(rng=rng): - - if 
self.merge_duplicated_nodes_toggle: - self._merge_duplicated_nodes() + for i in range(0,random.randint(1,15)): + rng.shuffle(self.mutate_methods_list) + for mutate_method in self.mutate_methods_list: + if mutate_method(rng=rng): + + if self.merge_duplicated_nodes_toggle: + self._merge_duplicated_nodes() - if self.__debug: - print(mutate_method) + if self.__debug: + print(mutate_method) - if self.root not in self.graph.nodes: - print('lost root something went wrong with ', mutate_method) + if self.root not in self.graph.nodes: + print('lost root something went wrong with ', mutate_method) - if len(self.graph.predecessors(self.root)) > 0: - print('root has parents ', mutate_method) + if len(self.graph.predecessors(self.root)) > 0: + print('root has parents ', mutate_method) - if any([n in nx.ancestors(self.graph,n) for n in self.graph.nodes]): - print('a node is connecting to itself...') + if any([n in nx.ancestors(self.graph,n) for n in self.graph.nodes]): + print('a node is connecting to itself...') - if self.__debug: - try: - nx.find_cycle(self.graph) - print('something went wrong with ', mutate_method) - except: - pass + if self.__debug: + try: + nx.find_cycle(self.graph) + print('something went wrong with ', mutate_method) + except: + pass - self.graphkey = None - return True + self.graphkey = None return False @@ -168,6 +174,9 @@ def _mutate_insert_leaf(self, rng=None): return False def _mutate_insert_inner_node(self, rng=None): + """ + Finds an edge in the graph and inserts a new node between the two nodes. Removes the edge between the two nodes. 
+ """ rng = np.random.default_rng(rng) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) @@ -184,12 +193,12 @@ def _mutate_insert_inner_node(self, rng=None): if len(list(nx.descendants(self.graph,node))) ==0 : continue - new_node = self.inner_search_space.generate(rng) + new_node = self.inner_search_space.generate(rng) - self.graph.add_node(new_node) - self.graph.add_edges_from([(node, new_node), (new_node, child_node)]) - self.graph.remove_edge(node, child_node) - return True + self.graph.add_node(new_node) + self.graph.add_edges_from([(node, new_node), (new_node, child_node)]) + self.graph.remove_edge(node, child_node) + return True return False @@ -287,6 +296,10 @@ def _mutate_add_edge(self, rng=None): return False def _mutate_insert_bypass_node(self, rng=None): + """ + Pick two nodes (they don't necessarily need to be connected). Create a new node. Connect one node to the new node and the new node to the other node. + Does not remove any edges. 
+ """ rng = np.random.default_rng(rng) if self.max_size > self.graph.number_of_nodes(): sorted_nodes_list = list(self.graph.nodes) @@ -301,11 +314,11 @@ def _mutate_insert_bypass_node(self, rng=None): if len(list(nx.descendants(self.graph,node))) ==0 : continue - new_node = self.inner_search_space.generate(rng) + new_node = self.inner_search_space.generate(rng) - self.graph.add_node(new_node) - self.graph.add_edges_from([(node, new_node), (new_node, child_node)]) - return True + self.graph.add_node(new_node) + self.graph.add_edges_from([(node, new_node), (new_node, child_node)]) + return True return False diff --git a/tpot2/search_spaces/pipelines/sequential.py b/tpot2/search_spaces/pipelines/sequential.py index 4459a284..b880904a 100644 --- a/tpot2/search_spaces/pipelines/sequential.py +++ b/tpot2/search_spaces/pipelines/sequential.py @@ -6,21 +6,22 @@ from typing import Generator, List, Tuple, Union import random from ..base import SklearnIndividual, SklearnIndividualGenerator +from ..tuple_index import TupleIndex class SequentialPipelineIndividual(SklearnIndividual): # takes in a list of search spaces. each space is a list of SklearnIndividualGenerators. # will produce a pipeline of Sequential length. Each step in the pipeline will correspond to the the search space provided in the same index. 
- def __init__(self, search_spaces : List[SklearnIndividualGenerator] ) -> None: + def __init__(self, search_spaces : List[SklearnIndividualGenerator], rng=None) -> None: super().__init__() self.search_spaces = search_spaces - self.pipeline = self._generate_pipeline() + self.pipeline = [] - def _generate_pipeline(self, rng=None): - pipeline = [] for space in self.search_spaces: - pipeline.append(space.generate(rng)) - return pipeline + self.pipeline.append(space.generate(rng)) + + self.pipeline = np.array(self.pipeline) + def mutate(self, rng=None): rng = np.random.default_rng() @@ -30,7 +31,23 @@ def mutate(self, rng=None): def crossover(self, other, rng=None): #swap a random step in the pipeline with the corresponding step in the other pipeline + if len(self.pipeline) != len(other.pipeline): + return False + + if len(self.pipeline) < 2: + return False + rng = np.random.default_rng() + cx_funcs = [self._crossover_swap_random_steps, self._crossover_swap_segment, self._crossover_inner_step] + + rng.shuffle(cx_funcs) + for cx_func in cx_funcs: + if cx_func(other, rng): + return True + + return False + + def _crossover_swap_step(self, other, rng): if len(self.pipeline) != len(other.pipeline): return False @@ -43,11 +60,51 @@ def crossover(self, other, rng=None): self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx] return True + def _crossover_swap_random_steps(self, other, rng): + rng = np.random.default_rng() + #selet steps idxs with probability 0.5 + idxs = rng.random(len(self.pipeline)) < 0.5 + #swap steps + self.pipeline[idxs], other.pipeline[idxs] = other.pipeline[idxs], self.pipeline[idxs] + + return True + + def _crossover_swap_segment(self, other, rng): + if len(self.pipeline) != len(other.pipeline): + return False + + if len(self.pipeline) < 2: + return False + + rng = np.random.default_rng() + idx = rng.integers(1,len(self.pipeline)) + + left = rng.choice([True, False]) + if left: + self.pipeline[:idx], other.pipeline[:idx] = 
other.pipeline[:idx], self.pipeline[:idx] + else: + self.pipeline[idx:], other.pipeline[idx:] = other.pipeline[idx:], self.pipeline[idx:] + + return True + + def _crossover_inner_step(self, other, rng): + rng = np.random.default_rng() + + crossover_success = False + for idx in range(len(self.pipeline)): + if rng.random() < 0.5: + if self.pipeline[idx].crossover(other.pipeline[idx], rng): + crossover_success = True + + return crossover_success + def export_pipeline(self): return sklearn.pipeline.make_pipeline(*[step.export_pipeline() for step in self.pipeline]) def unique_id(self): - return tuple([step.unique_id() for step in self.pipeline]) + l = [step.unique_id() for step in self.pipeline] + l = ["SequentialPipeline"] + l + return TupleIndex(tuple(l)) class SequentialPipeline(SklearnIndividualGenerator): diff --git a/tpot2/search_spaces/pipelines/tree.py b/tpot2/search_spaces/pipelines/tree.py index 92825ef4..b2e642e7 100644 --- a/tpot2/search_spaces/pipelines/tree.py +++ b/tpot2/search_spaces/pipelines/tree.py @@ -22,6 +22,7 @@ def __init__(self, self.crossover_methods_list = [self._crossover_swap_branch, self._crossover_swap_node, self._crossover_nodes] self.mutate_methods_list = [self._mutate_insert_leaf, self._mutate_insert_inner_node, self._mutate_remove_node, self._mutate_node] + self.merge_duplicated_nodes_toggle = False diff --git a/tpot2/search_spaces/pipelines/union.py b/tpot2/search_spaces/pipelines/union.py new file mode 100644 index 00000000..6f165e27 --- /dev/null +++ b/tpot2/search_spaces/pipelines/union.py @@ -0,0 +1,90 @@ +import tpot2 +import numpy as np +import pandas as pd +import sklearn +from tpot2 import config +from typing import Generator, List, Tuple, Union +import random +from ..base import SklearnIndividual, SklearnIndividualGenerator +from ..tuple_index import TupleIndex + +class UnionPipelineIndividual(SklearnIndividual): + """ + Takes in a list of search spaces. each space is a list of SklearnIndividualGenerators. 
+ Will produce a FeatureUnion pipeline. Each step in the pipeline will correspond to the the search space provided in the same index. + The resulting pipeline will be a FeatureUnion of the steps in the pipeline. + + """ + + def __init__(self, search_spaces : List[SklearnIndividualGenerator], rng=None) -> None: + super().__init__() + self.search_spaces = search_spaces + + self.pipeline = [] + for space in self.search_spaces: + self.pipeline.append(space.generate(rng)) + + def mutate(self, rng=None): + rng = np.random.default_rng() + step = rng.choice(self.pipeline) + return step.mutate(rng) + + + def crossover(self, other, rng=None): + #swap a random step in the pipeline with the corresponding step in the other pipeline + rng = np.random.default_rng() + + cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step] + rng.shuffle(cx_funcs) + for cx_func in cx_funcs: + if cx_func(other, rng): + return True + + return False + + def _crossover_swap_step(self, other, rng): + rng = np.random.default_rng() + idx = rng.integers(1,len(self.pipeline)) + + self.pipeline[idx], other.pipeline[idx] = other.pipeline[idx], self.pipeline[idx] + return True + + def _crossover_swap_random_steps(self, other, rng): + rng = np.random.default_rng() + #selet steps idxs with probability 0.5 + idxs = rng.random(len(self.pipeline)) < 0.5 + #swap steps + self.pipeline[idxs], other.pipeline[idxs] = other.pipeline[idxs], self.pipeline[idxs] + + return True + + def _crossover_inner_step(self, other, rng): + rng = np.random.default_rng() + + crossover_success = False + for idx in range(len(self.pipeline)): + if rng.random() < 0.5: + if self.pipeline[idx].crossover(other.pipeline[idx], rng): + crossover_success = True + + return crossover_success + + def export_pipeline(self): + return sklearn.pipeline.FeatureUnion(transformer_list=[step.export_pipeline() for step in self.pipeline]) + + def unique_id(self): + l = [step.unique_id() for step in self.pipeline] + l = ["FeatureUnion"] + l 
+ return TupleIndex(tuple(l)) + + +class UnionPipeline(SklearnIndividualGenerator): + def __init__(self, search_spaces : List[SklearnIndividualGenerator] ) -> None: + """ + Takes in a list of search spaces. will produce a pipeline of Sequential length. Each step in the pipeline will correspond to the the search space provided in the same index. + """ + + self.search_spaces = search_spaces + + def generate(self, rng=None): + return UnionPipelineIndividual(self.search_spaces) \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/wrapper.py b/tpot2/search_spaces/pipelines/wrapper.py index 26e87b58..14625f9e 100644 --- a/tpot2/search_spaces/pipelines/wrapper.py +++ b/tpot2/search_spaces/pipelines/wrapper.py @@ -7,7 +7,7 @@ import random from ..base import SklearnIndividual, SklearnIndividualGenerator from ConfigSpace import ConfigurationSpace - +from ..tuple_index import TupleIndex class WrapperPipelineIndividual(SklearnIndividual): def __init__( @@ -65,14 +65,17 @@ def export_pipeline(self): wrapped_est = self.method(est, **final_params) return wrapped_est - def unique_id(self): - if self.hyperparameters_parser is not None: - final_params = self.hyperparameters_parser(self.hyperparameters) - else: - final_params = self.hyperparameters - return (self.method, str(tuple(sorted(list(final_params.items())))) ,self.node.unique_id()) + def unique_id(self): + #return a dictionary of the method and the hyperparameters + method_str = self.method.__name__ + params = list(self.hyperparameters.keys()) + params = sorted(params) + + id_str = f"{method_str}({', '.join([f'{param}={self.hyperparameters[param]}' for param in params])})" + + return TupleIndex(("WrapperPipeline", id_str, self.node.unique_id())) class WrapperPipeline(SklearnIndividualGenerator): diff --git a/tpot2/search_spaces/tuple_index.py b/tpot2/search_spaces/tuple_index.py new file mode 100644 index 00000000..adfbbb2c --- /dev/null +++ b/tpot2/search_spaces/tuple_index.py @@ -0,0 +1,25 @@ +import 
numpy as np + +class TupleIndex(): + """ + TPOT2 uses tuples to create a unique id for some pipeline search spaces. However, tuples sometimes don't interact correctly with pandas indexes. + This class is a wrapper around a tuple that allows it to be used as a key in a dictionary, without it being an iterable. + + An alternative could be to make unique id return a string, but this would not work with graphpipelines, which require a special object. + This class allows linear pipelines to contain graph pipelines while still being able to be used as a key in a dictionary. + + """ + def __init__(self, tup): + self.tup = tup + + def __eq__(self,other) -> bool: + return self.tup == other + + def __hash__(self) -> int: + return self.tup.__hash__() + + def __str__(self) -> str: + return self.tup.__str__() + + def __repr__(self) -> str: + return self.tup.__repr__() \ No newline at end of file From 3c6e8d2f9dc776c46153d81f8267dfdebdb8b775 Mon Sep 17 00:00:00 2001 From: perib Date: Fri, 3 May 2024 17:16:13 -0700 Subject: [PATCH 3/7] make more mutation/cx functions available for gp --- tpot2/search_spaces/pipelines/dynamicunion.py | 1 + tpot2/search_spaces/pipelines/graph.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tpot2/search_spaces/pipelines/dynamicunion.py b/tpot2/search_spaces/pipelines/dynamicunion.py index 5fa0540d..3eee8ced 100644 --- a/tpot2/search_spaces/pipelines/dynamicunion.py +++ b/tpot2/search_spaces/pipelines/dynamicunion.py @@ -65,6 +65,7 @@ def _mutate_replace_step(self, rng): self.pipeline[idx] = self.search_space.generate(rng) return True + #TODO mutate one step or multiple? 
def _mutate_inner_step(self, rng): rng = np.random.default_rng() indexes = rng.random(len(self.pipeline)) < 0.5 diff --git a/tpot2/search_spaces/pipelines/graph.py b/tpot2/search_spaces/pipelines/graph.py index 36db8082..419fe2b1 100644 --- a/tpot2/search_spaces/pipelines/graph.py +++ b/tpot2/search_spaces/pipelines/graph.py @@ -102,8 +102,8 @@ def __init__( self.crossover_methods_list = [self._crossover_swap_branch,]#[self._crossover_swap_branch, self._crossover_swap_node, self._crossover_take_branch] #TODO self._crossover_nodes, else: - self.mutate_methods_list = [self._mutate_insert_leaf, self._mutate_insert_inner_node, self._mutate_remove_node, self._mutate_node] - self.crossover_methods_list = [self._crossover_swap_branch,]#[self._crossover_swap_branch, self._crossover_swap_node, self._crossover_take_branch] #TODO self._crossover_nodes, + self.mutate_methods_list = [self._mutate_insert_leaf, self._mutate_insert_inner_node, self._mutate_remove_node, self._mutate_node, self._mutate_insert_bypass_node] + self.crossover_methods_list = [self._crossover_swap_branch, self._crossover_nodes, self._crossover_take_branch ]#[self._crossover_swap_branch, self._crossover_swap_node, self._crossover_take_branch] #TODO self._crossover_nodes, self.merge_duplicated_nodes_toggle = True From 5b8f53fc1113ba7fb0defcbce7879bf76206510e Mon Sep 17 00:00:00 2001 From: perib Date: Fri, 3 May 2024 18:04:41 -0700 Subject: [PATCH 4/7] fixed crossover between different node types causing a crash. 
now different node types return false --- tpot2/search_spaces/base.py | 13 +++++- tpot2/search_spaces/nodes/estimator_node.py | 2 +- .../nodes/estimator_node_custom_sampler.py | 5 ++- .../nodes/estimator_node_simple.py | 2 +- tpot2/search_spaces/nodes/fss_node.py | 2 +- .../nodes/genetic_feature_selection.py | 2 +- tpot2/search_spaces/pipelines/choice.py | 2 +- .../search_spaces/pipelines/dynamic_linear.py | 2 +- tpot2/search_spaces/pipelines/dynamicunion.py | 2 +- tpot2/search_spaces/pipelines/graph.py | 2 +- tpot2/search_spaces/pipelines/sequential.py | 4 +- tpot2/search_spaces/pipelines/union.py | 2 +- tpot2/search_spaces/pipelines/wrapper.py | 2 +- .../search_spaces/tests/test_search_spaces.py | 44 +++++++++++++++++++ 14 files changed, 71 insertions(+), 15 deletions(-) create mode 100644 tpot2/search_spaces/tests/test_search_spaces.py diff --git a/tpot2/search_spaces/base.py b/tpot2/search_spaces/base.py index 9e453640..2977d491 100644 --- a/tpot2/search_spaces/base.py +++ b/tpot2/search_spaces/base.py @@ -10,6 +10,10 @@ import networkx as nx from . 
import graph_utils from typing import final +from abc import ABC, abstractmethod + + + class SklearnIndividual(tpot2.BaseIndividual): @@ -18,8 +22,15 @@ def __init__(self,) -> None: def mutate(self, rng=None): return + + @final + def crossover(self, other, rng=None, **kwargs): + if not isinstance(other, type(self)): + return False + return self._crossover(other, rng=rng, **kwargs) - def crossover(self, other, rng=None): + @abstractmethod + def _crossover(self, other, rng=None): return def export_pipeline(self) -> BaseEstimator: diff --git a/tpot2/search_spaces/nodes/estimator_node.py b/tpot2/search_spaces/nodes/estimator_node.py index 50d698f3..4724405e 100644 --- a/tpot2/search_spaces/nodes/estimator_node.py +++ b/tpot2/search_spaces/nodes/estimator_node.py @@ -60,7 +60,7 @@ def mutate(self, rng=None): self.check_hyperparameters_for_None() return True - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): if isinstance(self.space, dict): return False diff --git a/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py b/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py index 4ec76ef9..93a55a4e 100644 --- a/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py +++ b/tpot2/search_spaces/nodes/estimator_node_custom_sampler.py @@ -6,9 +6,10 @@ from tpot2 import config from typing import Generator, List, Tuple, Union import random -from ..base import SklearnIndividual, SklearnIndividualGenerator +from ..base import SklearnIndividual, SklearnIndividualGenerator, check_same_subclass from ConfigSpace import ConfigurationSpace + class EstimatorNodeCustomIndividual(SklearnIndividual): def __init__(self, method: type, sample_func : callable, @@ -24,7 +25,7 @@ def mutate(self, rng=None): self.hyperparameters = self.sample_func(rng) return True - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): rng = np.random.default_rng(rng) if self.method != other.method: return False diff --git 
a/tpot2/search_spaces/nodes/estimator_node_simple.py b/tpot2/search_spaces/nodes/estimator_node_simple.py index 934c899e..8063526a 100644 --- a/tpot2/search_spaces/nodes/estimator_node_simple.py +++ b/tpot2/search_spaces/nodes/estimator_node_simple.py @@ -37,7 +37,7 @@ def _mutate_hyperparameters(self, rng=None): return True - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): rng = np.random.default_rng(rng) if self.method != other.method: return False diff --git a/tpot2/search_spaces/nodes/fss_node.py b/tpot2/search_spaces/nodes/fss_node.py index 4dda0d92..46aef024 100644 --- a/tpot2/search_spaces/nodes/fss_node.py +++ b/tpot2/search_spaces/nodes/fss_node.py @@ -51,7 +51,7 @@ def mutate(self, rng=None): self.sel_subset = self.subset_dict[self.selected_subset_name] - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): self.selected_subset_name = other.selected_subset_name self.sel_subset = other.sel_subset diff --git a/tpot2/search_spaces/nodes/genetic_feature_selection.py b/tpot2/search_spaces/nodes/genetic_feature_selection.py index 2f55c7d5..13352d4b 100644 --- a/tpot2/search_spaces/nodes/genetic_feature_selection.py +++ b/tpot2/search_spaces/nodes/genetic_feature_selection.py @@ -64,7 +64,7 @@ def mutate(self, rng=None): return rng.choice(self.mutation_list)(rng) - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): rng = np.random.default_rng(rng) if rng.uniform() < self.crossover_rate_rate: diff --git a/tpot2/search_spaces/pipelines/choice.py b/tpot2/search_spaces/pipelines/choice.py index 9afcbb80..d86b26ec 100644 --- a/tpot2/search_spaces/pipelines/choice.py +++ b/tpot2/search_spaces/pipelines/choice.py @@ -29,7 +29,7 @@ def _mutate_select_new_node(self, rng=None): def _mutate_node(self, rng=None): return self.node.mutate(rng) - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): return self.node.crossover(other.node, rng) def export_pipeline(self): 
diff --git a/tpot2/search_spaces/pipelines/dynamic_linear.py b/tpot2/search_spaces/pipelines/dynamic_linear.py index 20c1ea37..33c1f670 100644 --- a/tpot2/search_spaces/pipelines/dynamic_linear.py +++ b/tpot2/search_spaces/pipelines/dynamic_linear.py @@ -63,7 +63,7 @@ def _mutate_step(self, rng=None): return step.mutate(rng) - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): rng = np.random.default_rng() if len(self.pipeline) < 2 or len(other.pipeline) < 2: diff --git a/tpot2/search_spaces/pipelines/dynamicunion.py b/tpot2/search_spaces/pipelines/dynamicunion.py index 3eee8ced..d5cd9b37 100644 --- a/tpot2/search_spaces/pipelines/dynamicunion.py +++ b/tpot2/search_spaces/pipelines/dynamicunion.py @@ -81,7 +81,7 @@ def _mutate_inner_step(self, rng): return mutated - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): rng = np.random.default_rng() cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step] diff --git a/tpot2/search_spaces/pipelines/graph.py b/tpot2/search_spaces/pipelines/graph.py index 419fe2b1..b48c364d 100644 --- a/tpot2/search_spaces/pipelines/graph.py +++ b/tpot2/search_spaces/pipelines/graph.py @@ -323,7 +323,7 @@ def _mutate_insert_bypass_node(self, rng=None): return False - def crossover(self, ind2, rng=None): + def _crossover(self, ind2, rng=None): ''' self is the first individual, ind2 is the second individual If crossover_same_depth, it will select graphindividuals at the same recursive depth. diff --git a/tpot2/search_spaces/pipelines/sequential.py b/tpot2/search_spaces/pipelines/sequential.py index b880904a..8667cd5f 100644 --- a/tpot2/search_spaces/pipelines/sequential.py +++ b/tpot2/search_spaces/pipelines/sequential.py @@ -22,14 +22,14 @@ def __init__(self, search_spaces : List[SklearnIndividualGenerator], rng=None) - self.pipeline = np.array(self.pipeline) - + #TODO, mutate all steps or just one? 
def mutate(self, rng=None): rng = np.random.default_rng() step = rng.choice(self.pipeline) return step.mutate(rng) - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): #swap a random step in the pipeline with the corresponding step in the other pipeline if len(self.pipeline) != len(other.pipeline): return False diff --git a/tpot2/search_spaces/pipelines/union.py b/tpot2/search_spaces/pipelines/union.py index 6f165e27..c2071bdc 100644 --- a/tpot2/search_spaces/pipelines/union.py +++ b/tpot2/search_spaces/pipelines/union.py @@ -30,7 +30,7 @@ def mutate(self, rng=None): return step.mutate(rng) - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): #swap a random step in the pipeline with the corresponding step in the other pipeline rng = np.random.default_rng() diff --git a/tpot2/search_spaces/pipelines/wrapper.py b/tpot2/search_spaces/pipelines/wrapper.py index 14625f9e..32f78a2e 100644 --- a/tpot2/search_spaces/pipelines/wrapper.py +++ b/tpot2/search_spaces/pipelines/wrapper.py @@ -51,7 +51,7 @@ def _mutate_hyperparameters(self, rng=None): def _mutate_node(self, rng=None): return self.node.mutate(rng) - def crossover(self, other, rng=None): + def _crossover(self, other, rng=None): return self.node.crossover(other.node, rng) def export_pipeline(self): diff --git a/tpot2/search_spaces/tests/test_search_spaces.py b/tpot2/search_spaces/tests/test_search_spaces.py new file mode 100644 index 00000000..6baf3ce3 --- /dev/null +++ b/tpot2/search_spaces/tests/test_search_spaces.py @@ -0,0 +1,44 @@ +# Test all nodes have all dictionaries +import pytest +import tpot2 + +import tpot2 +from ConfigSpace import ConfigurationSpace +from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal +from sklearn.neighbors import KNeighborsClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier +from sklearn.preprocessing import StandardScaler + + +def 
test_EstimatorNodeCrossover(): + knn_configspace = {} + standard_scaler_configspace = {} + + knn_node = tpot2.search_spaces.nodes.EstimatorNode( + method = KNeighborsClassifier, + space = knn_configspace, + ) + + knnind1 = knn_node.generate() + knnind2 = knn_node.generate() + + for i in range(0,10): + knnind1.mutate() + knnind2.mutate() + knnind1.crossover(knnind2) + + +def test_ValueError_different_types(): + knn_node = tpot2.config.get_search_space(["KNeighborsClassifier"]) + sfm_wrapper_node = tpot2.config.get_search_space(["SelectFromModel_classification"]) + + for i in range(10): + ind1 = knn_node.generate() + ind2 = sfm_wrapper_node.generate() + assert not ind1.crossover(ind2) + assert not ind2.crossover(ind1) + +if __name__ == "__main__": + test_EstimatorNodeCrossover() + test_ValueError_different_types() \ No newline at end of file From 9338db7df0d8f97590f0acd830036d97b21dc743 Mon Sep 17 00:00:00 2001 From: perib Date: Fri, 10 May 2024 17:04:17 -0700 Subject: [PATCH 5/7] unions, wrapping classifiers/regressors as transformers, multiple options in get_search_space now get flattened to have equal probability. 
--- tpot2/builtin_modules/__init__.py | 3 +- tpot2/builtin_modules/estimatortransformer.py | 121 ++++++++++++++++++ tpot2/config/get_configspace.py | 12 +- tpot2/search_spaces/pipelines/__init__.py | 4 +- tpot2/search_spaces/pipelines/dynamicunion.py | 2 +- tpot2/search_spaces/pipelines/union.py | 2 +- tpot2/search_spaces/pipelines/wrapper.py | 17 ++- 7 files changed, 149 insertions(+), 12 deletions(-) create mode 100644 tpot2/builtin_modules/estimatortransformer.py diff --git a/tpot2/builtin_modules/__init__.py b/tpot2/builtin_modules/__init__.py index 730d76a0..32207798 100644 --- a/tpot2/builtin_modules/__init__.py +++ b/tpot2/builtin_modules/__init__.py @@ -5,4 +5,5 @@ from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer from .passthrough import Passthrough from .imputer import ColumnSimpleImputer -from .selector_wrappers import RFE_ExtraTreesClassifier, SelectFromModel_ExtraTreesClassifier, RFE_ExtraTreesRegressor, SelectFromModel_ExtraTreesRegressor \ No newline at end of file +from .selector_wrappers import RFE_ExtraTreesClassifier, SelectFromModel_ExtraTreesClassifier, RFE_ExtraTreesRegressor, SelectFromModel_ExtraTreesRegressor +from .estimatortransformer import EstimatorTransformer \ No newline at end of file diff --git a/tpot2/builtin_modules/estimatortransformer.py b/tpot2/builtin_modules/estimatortransformer.py new file mode 100644 index 00000000..839e679c --- /dev/null +++ b/tpot2/builtin_modules/estimatortransformer.py @@ -0,0 +1,121 @@ +from numpy import ndarray +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.model_selection import cross_val_predict +from sklearn.utils.validation import check_is_fitted +from sklearn.utils.metaestimators import available_if +import numpy as np +from 
sklearn.utils.validation import check_is_fitted + +class EstimatorTransformer(BaseEstimator, TransformerMixin): + def __init__(self, estimator, method='auto', passthrough=False, cross_val_predict_cv=0): + self.estimator = estimator + self.method = method + self.passthrough = passthrough + self.cross_val_predict_cv = cross_val_predict_cv + + def fit(self, X, y=None): + return self.estimator.fit(X, y) + + def transform(self, X): + if self.method == 'auto': + if hasattr(self.estimator, 'predict_proba'): + method = 'predict_proba' + elif hasattr(self.estimator, 'decision_function'): + method = 'decision_function' + elif hasattr(self.estimator, 'predict'): + method = 'predict' + else: + raise ValueError('Estimator has no valid method') + else: + method = self.method + + output = getattr(self.estimator, method)(X) + output=np.array(output) + + if len(output.shape) == 1: + output = output.reshape(-1,1) + + if self.passthrough: + return np.hstack((output, X)) + else: + return output + + + + def fit_transform(self, X, y=None): + self.estimator.fit(X,y) + + if self.method == 'auto': + if hasattr(self.estimator, 'predict_proba'): + method = 'predict_proba' + elif hasattr(self.estimator, 'decision_function'): + method = 'decision_function' + elif hasattr(self.estimator, 'predict'): + method = 'predict' + else: + raise ValueError('Estimator has no valid method') + else: + method = self.method + + if self.cross_val_predict_cv > 0: + output = cross_val_predict(self.estimator, X, y=y, cv=self.cross_val_predict_cv) + + else: + output = getattr(self.estimator, method)(X) + #reshape if needed + + if len(output.shape) == 1: + output = output.reshape(-1,1) + + output=np.array(output) + if self.passthrough: + return np.hstack((output, X)) + else: + return output + + def _estimator_has(attr): + '''Check if we can delegate a method to the underlying estimator. + First, we check the first fitted final estimator if available, otherwise we + check the unfitted final estimator. 
+ ''' + return lambda self: (self.estimator is not None and + hasattr(self.estimator, attr) + ) + + @available_if(_estimator_has('predict')) + def predict(self, X, **predict_params): + check_is_fitted(self.estimator) + #X = check_array(X) + + preds = self.estimator.predict(X,**predict_params) + return preds + + @available_if(_estimator_has('predict_proba')) + def predict_proba(self, X, **predict_params): + check_is_fitted(self.estimator) + #X = check_array(X) + return self.estimator.predict_proba(X,**predict_params) + + @available_if(_estimator_has('decision_function')) + def decision_function(self, X, **predict_params): + check_is_fitted(self.estimator) + #X = check_array(X) + return self.estimator.decision_function(X,**predict_params) + + def __sklearn_is_fitted__(self): + """ + Check fitted status and return a Boolean value. + """ + return check_is_fitted(self.estimator) + + + # @property + # def _estimator_type(self): + # return self.estimator._estimator_type + + + + @property + def classes_(self): + """The classes labels. 
Only exist if the last step is a classifier.""" + return self.estimator._classes \ No newline at end of file diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index 05a76920..05187067 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -393,19 +393,23 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta raise ValueError(f"Could not find configspace for {name}") -def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_state=None): +def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_state=None, return_choice_pipeline=True): #if list of names, return a list of EstimatorNodes if isinstance(name, list) or isinstance(name, np.ndarray): - search_spaces = [get_search_space(n, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state) for n in name] + search_spaces = [get_search_space(n, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=False) for n in name] #remove Nones search_spaces = [s for s in search_spaces if s is not None] - return ChoicePipeline(search_spaces=search_spaces) + + if return_choice_pipeline: + return ChoicePipeline(search_spaces=np.hstack(search_spaces)) + else: + return np.hstack(search_spaces) if name in GROUPNAMES: name_list = GROUPNAMES[name] - return get_search_space(name_list, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state) + return get_search_space(name_list, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=return_choice_pipeline) return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state) diff --git a/tpot2/search_spaces/pipelines/__init__.py b/tpot2/search_spaces/pipelines/__init__.py index ec90eb0e..2c17a950 100644 --- 
a/tpot2/search_spaces/pipelines/__init__.py +++ b/tpot2/search_spaces/pipelines/__init__.py @@ -3,4 +3,6 @@ from .sequential import * from .graph import * from .tree import * -from .wrapper import * \ No newline at end of file +from .wrapper import * +from .dynamicunion import * +from .union import * \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/dynamicunion.py b/tpot2/search_spaces/pipelines/dynamicunion.py index d5cd9b37..72b9cb7c 100644 --- a/tpot2/search_spaces/pipelines/dynamicunion.py +++ b/tpot2/search_spaces/pipelines/dynamicunion.py @@ -135,7 +135,7 @@ def _crossover_inner_step(self, other, rng): return crossover_success def export_pipeline(self): - return sklearn.pipeline.make_pipeline(*[step.export_pipeline() for step in self.pipeline]) + return sklearn.pipeline.make_union(*[step.export_pipeline() for step in self.pipeline]) def unique_id(self): l = [step.unique_id() for step in self.pipeline] diff --git a/tpot2/search_spaces/pipelines/union.py b/tpot2/search_spaces/pipelines/union.py index c2071bdc..c95408fe 100644 --- a/tpot2/search_spaces/pipelines/union.py +++ b/tpot2/search_spaces/pipelines/union.py @@ -70,7 +70,7 @@ def _crossover_inner_step(self, other, rng): return crossover_success def export_pipeline(self): - return sklearn.pipeline.FeatureUnion(transformer_list=[step.export_pipeline() for step in self.pipeline]) + return sklearn.pipeline.make_union(*[step.export_pipeline() for step in self.pipeline]) def unique_id(self): l = [step.unique_id() for step in self.pipeline] diff --git a/tpot2/search_spaces/pipelines/wrapper.py b/tpot2/search_spaces/pipelines/wrapper.py index 32f78a2e..62662720 100644 --- a/tpot2/search_spaces/pipelines/wrapper.py +++ b/tpot2/search_spaces/pipelines/wrapper.py @@ -29,8 +29,13 @@ def __init__( self.method = method self.space = space rng = np.random.default_rng(rng) - self.space.seed(rng.integers(0, 2**32)) - self.hyperparameters = dict(self.space.sample_configuration()) + + if 
isinstance(space, dict): + self.hyperparameters = space + else: + rng = np.random.default_rng(rng) + self.space.seed(rng.integers(0, 2**32)) + self.hyperparameters = dict(self.space.sample_configuration()) self.hyperparameters_parser = hyperparameter_parser @@ -43,6 +48,8 @@ def mutate(self, rng=None): return self._mutate_node(rng) def _mutate_hyperparameters(self, rng=None): + if isinstance(self.space, dict): + return False rng = np.random.default_rng(rng) self.space.seed(rng.integers(0, 2**32)) self.hyperparameters = dict(self.space.sample_configuration()) @@ -83,7 +90,8 @@ def __init__( self, method: type, space: ConfigurationSpace, - estimator_search_space: SklearnIndividualGenerator, + estimator_search_space: SklearnIndividualGenerator, + hyperparameter_parser: callable = None, ) -> None: """ @@ -96,6 +104,7 @@ def __init__( self.estimator_search_space = estimator_search_space self.method = method self.space = space + self.hyperparameter_parser=hyperparameter_parser def generate(self, rng=None): - return WrapperPipelineIndividual(method=self.method, space=self.space, estimator_search_space=self.estimator_search_space, rng=rng) \ No newline at end of file + return WrapperPipelineIndividual(method=self.method, space=self.space, estimator_search_space=self.estimator_search_space, hyperparameter_parser=self.hyperparameter_parser, rng=rng) \ No newline at end of file From 4cba71769535a7547f6b67f9bdf19812410ab663 Mon Sep 17 00:00:00 2001 From: perib Date: Tue, 14 May 2024 20:16:25 -0700 Subject: [PATCH 6/7] functions to convert tpot1 config dicts to tpot2 --- tpot2/__init__.py | 2 +- tpot2/old_config_utils/__init__.py | 1 + tpot2/old_config_utils/old_config_utils.py | 140 +++++++++++++++++++++ tpot2/search_spaces/pipelines/wrapper.py | 17 ++- 4 files changed, 155 insertions(+), 5 deletions(-) create mode 100644 tpot2/old_config_utils/__init__.py create mode 100644 tpot2/old_config_utils/old_config_utils.py diff --git a/tpot2/__init__.py b/tpot2/__init__.py index 
f19f34b2..62290884 100644 --- a/tpot2/__init__.py +++ b/tpot2/__init__.py @@ -15,7 +15,7 @@ from . import objectives from . import selectors from . import tpot_estimator - +from . import old_config_utils from .tpot_estimator import TPOTClassifier, TPOTRegressor, TPOTEstimator, TPOTEstimatorSteadyState diff --git a/tpot2/old_config_utils/__init__.py b/tpot2/old_config_utils/__init__.py new file mode 100644 index 00000000..e61b81fc --- /dev/null +++ b/tpot2/old_config_utils/__init__.py @@ -0,0 +1 @@ +from .old_config_utils import convert_config_dict_to_list, convert_config_dict_to_choicepipeline, convert_config_dict_to_graphpipeline \ No newline at end of file diff --git a/tpot2/old_config_utils/old_config_utils.py b/tpot2/old_config_utils/old_config_utils.py new file mode 100644 index 00000000..016744da --- /dev/null +++ b/tpot2/old_config_utils/old_config_utils.py @@ -0,0 +1,140 @@ +from ConfigSpace import ConfigurationSpace +from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal +from ConfigSpace import EqualsCondition, OrConjunction, NotEqualsCondition, InCondition +from ..search_spaces.nodes.estimator_node import NONE_SPECIAL_STRING, TRUE_SPECIAL_STRING, FALSE_SPECIAL_STRING +from ..search_spaces.nodes import EstimatorNode +from ..search_spaces.pipelines import WrapperPipeline, ChoicePipeline, GraphPipeline +import ConfigSpace +import sklearn +from functools import partial +import inspect +import numpy as np + +def load_get_module_from_string(module_string): + module_name, class_name = module_string.rsplit('.', 1) + module = __import__(module_name, fromlist=[class_name]) + return getattr(module, class_name) + + +def hyperparameter_parser(hdict, function_params_conversion_dict): + d = hdict.copy() + d.update(function_params_conversion_dict) + return d + + + +def get_node_space(module_string, params): + method = load_get_module_from_string(module_string) + config_space = ConfigurationSpace() + sub_space = None + sub_space_name = None + + 
function_params_conversion_dict = {} + + if params is None: + return EstimatorNode(method=method, space=config_space) + + for param_name, param in params.items(): + if param is None: + config_space.add_hyperparameter(Categorical(param_name, [NONE_SPECIAL_STRING])) + + if isinstance(param, range): + param = list(param) + + if isinstance(param, list) or isinstance(param, np.ndarray): + if len(param) == 0: + p = param[0] + if p is None: + p = NONE_SPECIAL_STRING + elif type(p) == bool: + p = TRUE_SPECIAL_STRING if p else FALSE_SPECIAL_STRING + + config_space.add_hyperparameter(ConfigSpace.hyperparameters.Constant(param_name, p)) + else: + config_space.add_hyperparameter(Categorical(param_name, param)) + # if all(isinstance(i, int) for i in param): + # config_space.add_hyperparameter(Integer(param_name, (min(param), max(param)))) + # elif all(isinstance(i, float) for i in param): + # config_space.add_hyperparameter(Float(param_name, (min(param), max(param)))) + # else: + # config_space.add_hyperparameter(Categorical(param_name, param)) + elif isinstance(param, dict): #TPOT1 config dicts have dictionaries for values of hyperparameters that are either a function or an estimator + if len(param) > 1: + raise ValueError(f"Multiple items in dictionary entry for {param_name}") + + key = list(param.keys())[0] + + innermethod = load_get_module_from_string(key) + + if inspect.isclass(innermethod) and issubclass(innermethod, sklearn.base.BaseEstimator): #is an estimator + if sub_space is None: + sub_space_name = param_name + sub_space = get_node_space(key, param[key]) + else: + raise ValueError("Only multiple hyperparameters are estimators. 
Only one parameter ") + + else: #assume the key is a function and ignore the value + function_params_conversion_dict[param_name] = innermethod + + else: + # config_space.add_hyperparameter(Categorical(param_name, param)) + config_space.add_hyperparameter(ConfigSpace.hyperparameters.Constant(param_name, param)) + + parser=None + if len(function_params_conversion_dict) > 0: + parser = partial(hyperparameter_parser, function_params_conversion_dict) + + + if sub_space is None: + + if parser is not None: + return EstimatorNode(method=method, space=config_space, hyperparameter_parser=parser) + else: + return EstimatorNode(method=method, space=config_space) + + + else: + if parser is not None: + return WrapperPipeline(method=method, space=config_space, estimator_search_space=sub_space, wrapped_param_name=sub_space_name, hyperparameter_parser=parser) + else: + return WrapperPipeline(method=method, space=config_space, estimator_search_space=sub_space, wrapped_param_name=sub_space_name) + + +def convert_config_dict_to_list(config_dict): + search_spaces = [] + for key, value in config_dict.items(): + search_spaces.append(get_node_space(key, value)) + return search_spaces + + +def convert_config_dict_to_choicepipeline(config_dict): + search_spaces = [] + for key, value in config_dict.items(): + search_spaces.append(get_node_space(key, value)) + return ChoicePipeline(search_spaces) + + +def convert_config_dict_to_graphpipeline(config_dict): + root_search_spaces = [] + inner_search_spaces = [] + + for key, value in config_dict.items(): + #if root + if issubclass(load_get_module_from_string(key), sklearn.base.ClassifierMixin) or issubclass(load_get_module_from_string(key), sklearn.base.RegressorMixin): + root_search_spaces.append(get_node_space(key, value)) + else: + inner_search_spaces.append(get_node_space(key, value)) + + if len(root_search_spaces) == 0: + Warning("No classifiers or regressors found, allowing any estimator to be the root node") + root_search_spaces = 
inner_search_spaces + + #merge inner and root search spaces + + inner_space = np.concatenate([root_search_spaces,inner_search_spaces]) + + root_space = ChoicePipeline(root_search_spaces) + inner_space = ChoicePipeline(inner_search_spaces) + + final_space = GraphPipeline(root_search_space=root_space, inner_search_space=inner_space) + return final_space \ No newline at end of file diff --git a/tpot2/search_spaces/pipelines/wrapper.py b/tpot2/search_spaces/pipelines/wrapper.py index 62662720..1cd33bf3 100644 --- a/tpot2/search_spaces/pipelines/wrapper.py +++ b/tpot2/search_spaces/pipelines/wrapper.py @@ -16,19 +16,26 @@ def __init__( space: ConfigurationSpace, estimator_search_space: SklearnIndividualGenerator, hyperparameter_parser: callable = None, + wrapped_param_name: str = None, rng=None) -> None: super().__init__() - self.estimator_search_space = estimator_search_space - self.node = self.estimator_search_space.generate(rng) + + self.method = method self.space = space + self.estimator_search_space = estimator_search_space + self.hyperparameters_parser = hyperparameter_parser + self.wrapped_param_name = wrapped_param_name + + rng = np.random.default_rng(rng) + self.node = self.estimator_search_space.generate(rng) if isinstance(space, dict): self.hyperparameters = space @@ -37,7 +44,7 @@ def __init__( self.space.seed(rng.integers(0, 2**32)) self.hyperparameters = dict(self.space.sample_configuration()) - self.hyperparameters_parser = hyperparameter_parser + def mutate(self, rng=None): @@ -92,6 +99,7 @@ def __init__( space: ConfigurationSpace, estimator_search_space: SklearnIndividualGenerator, hyperparameter_parser: callable = None, + wrapped_param_name: str = None ) -> None: """ @@ -105,6 +113,7 @@ def __init__( self.method = method self.space = space self.hyperparameter_parser=hyperparameter_parser + self.wrapped_param_name = wrapped_param_name def generate(self, rng=None): - return WrapperPipelineIndividual(method=self.method, space=self.space, 
estimator_search_space=self.estimator_search_space, hyperparameter_parser=self.hyperparameter_parser, rng=rng) \ No newline at end of file + return WrapperPipelineIndividual(method=self.method, space=self.space, estimator_search_space=self.estimator_search_space, hyperparameter_parser=self.hyperparameter_parser, wrapped_param_name=self.wrapped_param_name, rng=rng) \ No newline at end of file From d30c29182095bdf8fc2aefcc0f14ee2d12704974 Mon Sep 17 00:00:00 2001 From: perib Date: Tue, 14 May 2024 20:22:58 -0700 Subject: [PATCH 7/7] small note --- tpot2/old_config_utils/old_config_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot2/old_config_utils/old_config_utils.py b/tpot2/old_config_utils/old_config_utils.py index 016744da..82892758 100644 --- a/tpot2/old_config_utils/old_config_utils.py +++ b/tpot2/old_config_utils/old_config_utils.py @@ -113,7 +113,7 @@ def convert_config_dict_to_choicepipeline(config_dict): search_spaces.append(get_node_space(key, value)) return ChoicePipeline(search_spaces) - +#Note doesn't convert estimators so they passthrough inputs like in TPOT1 def convert_config_dict_to_graphpipeline(config_dict): root_search_spaces = [] inner_search_spaces = []