From ee2a4ed0dc6100d571da6c9aa9fb3fbc3f347f24 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka
Date: Fri, 20 May 2022 21:26:38 +0200
Subject: [PATCH 1/4] Ensemble of PAL classes

Fixes #228
---
 src/pyepal/pal/core.py         | 10 ++--
 src/pyepal/pal/pal_base.py     | 39 +++++++++-----
 src/pyepal/pal/pal_ensemble.py | 94 ++++++++++++++++++++++++++++++++++
 tests/test_pal_ensemble.py     |  0
 4 files changed, 125 insertions(+), 18 deletions(-)
 create mode 100644 src/pyepal/pal/pal_ensemble.py
 create mode 100644 tests/test_pal_ensemble.py

diff --git a/src/pyepal/pal/core.py b/src/pyepal/pal/core.py
index 331d897..e23c7c4 100644
--- a/src/pyepal/pal/core.py
+++ b/src/pyepal/pal/core.py
@@ -275,7 +275,7 @@ def _get_max_wt(  # pylint:disable=too-many-arguments
     sampled: np.array,
     pooling_method: str = "fro",
     use_coef_var: bool = True,
-) -> int:
+) -> Tuple[int, float]:
     """Returns the index in design space with the maximum size of the hyperrectangle
     (scaled by the mean predictions, i.e., effectively,
     we use the coefficient of variation).
@@ -320,7 +320,7 @@ def _get_max_wt(  # pylint:disable=too-many-arguments
             max_uncertainty = uncertainty
             maxid = i
 
-    return maxid
+    return maxid, max_uncertainty
 
 
 @jit(nopython=True)
@@ -331,7 +331,7 @@ def _get_max_wt_all(  # pylint:disable=too-many-arguments
     sampled: np.array,
     pooling_method: str = "fro",
     use_coef_var: bool = True,
-) -> int:
+) -> Tuple[int, float]:
     """Returns the index in design space with the maximum size of the hyperrectangle
     (scaled by the mean predictions, i.e., effectively,
     we use the coefficient of variation).
@@ -351,7 +351,7 @@ def _get_max_wt_all(  # pylint:disable=too-many-arguments
         the unscaled rectangle sizes
 
     Returns:
-        int: index with maximum size of hyperrectangle
+        Tuple[int, float]: index with maximum size of hyperrectangle, its uncertainty
     """
     max_uncertainty = -np.inf
     maxid = 0
@@ -374,7 +374,7 @@ def _get_max_wt_all(  # pylint:disable=too-many-arguments
             max_uncertainty = uncertainty
             maxid = i
 
-    return maxid
+    return maxid, max_uncertainty
 
 
 @jit(nopython=True)
diff --git a/src/pyepal/pal/pal_base.py b/src/pyepal/pal/pal_base.py
index 12d42b9..dc65ed6 100644
--- a/src/pyepal/pal/pal_base.py
+++ b/src/pyepal/pal/pal_base.py
@@ -20,7 +20,7 @@
 import logging
 import warnings
 from copy import deepcopy
-from typing import List, Union
+from typing import Any, Iterable, List, Union
 
 import numpy as np
 from sklearn.metrics import mean_absolute_error
@@ -69,6 +69,7 @@ def __init__(  # pylint:disable=too-many-arguments
         goals: List[str] = None,
         coef_var_threshold: float = 3,
         ranges: Union[np.ndarray, None] = None,
+        pooling_method: str = "fro",
     ):
         r"""Initialize the PAL instance
 
@@ -95,6 +96,10 @@ def __init__(  # pylint:disable=too-many-arguments
                 If this is provided, we will use :math:`\epsilon \cdot ranges` to compute
                 the uncertainties of the hyperrectangles instead of the default
                 behavior :math:`\epsilon \cdot |\mu|`
+            pooling_method (str): Method that is used to aggregate
+                the uncertainty in different objectives into one scalar.
+                Available options are: "fro" (Frobenius/Euclidean norm), "mean",
+                "median". Defaults to "fro".
""" self.cross_val_points = 10 # maybe we make it an argument at some point @@ -441,21 +446,21 @@ def _replace_by_measurements(self, replace_mean: bool = True, replace_std: bool def run_one_step( # pylint:disable=too-many-arguments self, batch_size: int = 1, - pooling_method: str = "fro", sample_discarded: bool = False, use_coef_var: bool = True, replace_mean: bool = True, replace_std: bool = True, + replacement_models: Iterable[any] = None, ) -> Union[np.array, None]: - """[summary] + """Run one iteration of the PAL algorithm. That is, train the models, + get the predictions for all the design points and then classify them. + After classification, return the samples. We do not update the "sampled" + attrobute here. Args: batch_size (int, optional): Number of indices that will be returned. + If >1 then we use a greedy approximation. Defaults to 1. - pooling_method (str): Method that is used to aggregate - the uncertainty in different objectives into one scalar. - Available options are: "fro" (Frobenius/Euclidean norm), "mean", - "median". Defaults to "fro". sample_discarded (bool): if true, it will sample from all points and not only from the unclassified and Pareto optimal ones use_coef_var (bool): If True, uses the coefficient of variation instead of @@ -463,6 +468,9 @@ def run_one_step( # pylint:disable=too-many-arguments replace_mean (bool): If true uses the measured _means for the sampled points replace_std (bool): If true uses the measured standard deviation for the sampled points + replacement_models: A list of models that will be used to replace the models. + If the models are provide we skip the hyperparameter optimization and training. If is useful if, for some reason, the same model is trained somewhere else in parallel. Providing this takes precedence over hyperparameter and training schedules. + Defaults to None. 
 
         Raises:
             ValueError: In case the PAL instance was not initialized with
@@ -482,10 +490,15 @@ def run_one_step(  # pylint:disable=too-many-arguments
         if self.should_cross_validate():
             self._compare_mae_variance()
 
-        if self._should_optimize_hyperparameters():
-            self._set_hyperparameters()
+        if replacement_models is None:
+            if self._should_optimize_hyperparameters():
+                self._set_hyperparameters()
+
+            self._train()
+        else:
+            PAL_LOGGER.debug("Replacing models with provided ones.")
+            self.models = replacement_models
 
-        self._train()
         self._predict()
         self._update_beta()
@@ -500,7 +513,7 @@ def run_one_step(  # pylint:disable=too-many-arguments
         for _ in range(batch_size):
             sampled_idx = self.sample(
                 exclude_idx=samples,
-                pooling_method=pooling_method,
+                pooling_method=self.pooling_method,
                 sample_discarded=sample_discarded,
                 use_coef_var=use_coef_var,
             )
@@ -736,7 +749,7 @@ def sample(
             sampled_mask += exclude_mask
 
         if sample_discarded:
-            sampled_idx = _get_max_wt_all(
+            sampled_idx, _uncertainty = _get_max_wt_all(
                 self.rectangle_lows,
                 self.rectangle_ups,
                 self._means,
@@ -745,7 +758,7 @@ def sample(
                 use_coef_var,
             )
         else:
-            sampled_idx = _get_max_wt(
+            sampled_idx, _uncertainty = _get_max_wt(
                 self.rectangle_lows,
                 self.rectangle_ups,
                 self._means,
diff --git a/src/pyepal/pal/pal_ensemble.py b/src/pyepal/pal/pal_ensemble.py
new file mode 100644
index 0000000..c75d5a2
--- /dev/null
+++ b/src/pyepal/pal/pal_ensemble.py
@@ -0,0 +1,94 @@
+import numpy as np
+
+
+class PALEnsemble:
+    def __init__(self, pal_list):
+        self.pal_list = pal_list
+
+        # we just pick one class where we will update the models
+        self.head_pal = pal_list[0]
+
+    @classmethod
+    def from_class_and_kwarg_lists(cls, pal_class, **kwargs):
+        pal_list = []
+        iterable_keys = []
+        for key, value in kwargs.items():
+            if isinstance(value, (list, tuple)):
+                iterable_keys.append(key)
+
+        if len(iterable_keys) == 0:
+            raise ValueError(
+                "No iterable keys found in kwargs. If you do not provide iterable keys, please use a single PAL instance."
+            )
+
+        num_values = len(kwargs[iterable_keys[0]])
+
+        for key in iterable_keys:
+            if len(kwargs[key]) != num_values:
+                raise ValueError(
+                    "All iterable keys must have the same length. Please check the length of your iterable keys."
+                )
+
+        for i in range(num_values):
+            this_kwargs = {}
+            for key, value in kwargs.items():
+                if key in iterable_keys:
+                    this_kwargs[key] = value[i]
+                else:
+                    this_kwargs[key] = value
+            pal_list.append(pal_class(**this_kwargs))
+        return cls(pal_list)
+
+    def run_one_step(
+        self,
+        batch_size: int = 1,
+        pooling_method: str = "fro",
+        sample_discarded: bool = False,
+        use_coef_var: bool = True,
+        replace_mean: bool = True,
+        replace_std: bool = True,
+    ):
+        samples = []
+        uncertainties = []
+        head_samples, head_uncertainties = self.head_pal.run_one_step(
+            batch_size, pooling_method, sample_discarded, use_coef_var, replace_mean, replace_std
+        )
+        samples.extend(head_samples)
+        uncertainties.extend(head_uncertainties)
+
+        samples.extend(head_samples)
+
+        for pal in self.pal_list[1:]:
+            this_samples, this_uncertainties = pal.run_one_step(
+                batch_size,
+                pooling_method,
+                sample_discarded,
+                use_coef_var,
+                replace_mean,
+                replace_std,
+                replace_models=self.head_pal.models,
+            )
+            samples.extend(this_samples)
+            uncertainties.extend(this_uncertainties)
+
+        uncertainties_sorted, indices_sorted = zip(*sorted(zip(uncertainties, samples)))
+        uncertainties_sorted = np.array(uncertainties_sorted)
+        indices_sorted = np.array(indices_sorted)
+        _, original_sorted_indices = np.unique(indices_sorted, return_index=True)
+        indices_selected = indices_sorted[original_sorted_indices]
+        return indices_selected[-batch_size:], uncertainties_sorted[-batch_size:]
+
+    def augment_design_space(  # pylint: disable=invalid-name
+        self, X_design: np.ndarray, classify: bool = False, clean_classify: bool = True
+    ) -> None:
+        for pal in self.pal_list:
+            pal.augment_design_space(X_design, classify, clean_classify)
+
+    def update_train_set(
+        self,
+        indices: np.ndarray,
+        measurements: np.ndarray,
+        measurement_uncertainty: np.ndarray = None,
+    ) -> None:
+        for pal in self.pal_list:
+            pal.update_train_set(indices, measurements, measurement_uncertainty)
diff --git a/tests/test_pal_ensemble.py b/tests/test_pal_ensemble.py
new file mode 100644
index 0000000..e69de29
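
To make the broadcasting semantics of from_class_and_kwarg_lists above concrete, here is a minimal, self-contained sketch of the same kwarg handling: each list/tuple-valued keyword argument contributes one element per ensemble member, every other keyword argument is shared by all members. ToyPal is a hypothetical stand-in for a real PAL class, not part of pyepal; note that a list-valued argument such as models would also be picked up as an iterable key and split across members.

    # Minimal sketch of the kwarg handling in PALEnsemble.from_class_and_kwarg_lists.
    # ToyPal is a hypothetical stand-in for a real PAL class.
    class ToyPal:
        def __init__(self, ndim, delta, pooling_method):
            self.ndim = ndim
            self.delta = delta
            self.pooling_method = pooling_method


    kwargs = {"ndim": 3, "delta": 0.01, "pooling_method": ["fro", "mean", "median"]}

    # list/tuple-valued kwargs are zipped across members, the rest is broadcast
    iterable_keys = [k for k, v in kwargs.items() if isinstance(v, (list, tuple))]
    num_values = len(kwargs[iterable_keys[0]])  # number of ensemble members, here 3

    members = []
    for i in range(num_values):
        this_kwargs = {k: (v[i] if k in iterable_keys else v) for k, v in kwargs.items()}
        members.append(ToyPal(**this_kwargs))

    assert [m.pooling_method for m in members] == ["fro", "mean", "median"]
    assert all(m.delta == 0.01 for m in members)
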
From e0acc3b041c739bb35feb73cc030b74f6080c3ff Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka
Date: Mon, 23 May 2022 19:02:59 +0200
Subject: [PATCH 2/4] wip: working on ensembling

---
 src/pyepal/pal/core.py         |  7 +++++--
 src/pyepal/pal/pal_base.py     | 17 ++++++-----------
 src/pyepal/pal/pal_ensemble.py | 33 ++++++++++++++++++++++++++------
 tests/test_pal_ensemble.py     | 26 ++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/src/pyepal/pal/core.py b/src/pyepal/pal/core.py
index e23c7c4..a58dfc8 100644
--- a/src/pyepal/pal/core.py
+++ b/src/pyepal/pal/core.py
@@ -300,7 +300,7 @@ def _get_max_wt(  # pylint:disable=too-many-arguments
     """
     max_uncertainty = -np.inf
     maxid = 0
-
+    uncertainties = []
     pooling_method = pooling_method.lower()
 
     for i in range(0, len(unclassified_t)):  # pylint:disable=consider-using-enumerate
@@ -316,11 +316,12 @@ def _get_max_wt(  # pylint:disable=too-many-arguments
 
             uncer = rectangle_ups[i, :] - rectangle_lows[i, :]
             uncertainty = _pool(uncer, pooling_method)
+            uncertainties.append(uncertainty)
             if uncertainty > max_uncertainty:
                 max_uncertainty = uncertainty
                 maxid = i
 
-    return maxid, max_uncertainty
+    return maxid, uncertainties
 
 
 @jit(nopython=True)
@@ -355,6 +356,7 @@ def _get_max_wt_all(  # pylint:disable=too-many-arguments
     """
     max_uncertainty = -np.inf
     maxid = 0
+    uncertainties = []
 
     pooling_method = pooling_method.lower()
 
@@ -370,6 +372,7 @@ def _get_max_wt_all(  # pylint:disable=too-many-arguments
         else:
            uncer = rectangle_ups[i, :] - rectangle_lows[i, :]
        uncertainty = _pool(uncer, pooling_method)
+        uncertainties.append(uncertainty)
        if uncertainty > max_uncertainty:
            max_uncertainty = uncertainty
            maxid = i
diff --git a/src/pyepal/pal/pal_base.py b/src/pyepal/pal/pal_base.py
index dc65ed6..da54126 100644
--- a/src/pyepal/pal/pal_base.py
+++ b/src/pyepal/pal/pal_base.py
@@ -20,7 +20,7 @@
 import logging
 import warnings
 from copy import deepcopy
-from typing import Any, Iterable, List, Union
+from typing import Any, Iterable, List, Union, Tuple
 
 import numpy as np
 from sklearn.metrics import mean_absolute_error
@@ -135,6 +135,7 @@ def __init__(  # pylint:disable=too-many-arguments
         # measurement_uncertainty is provided in update_train_set by the user
         self.measurement_uncertainty = np.zeros((design_space_size, self.ndim))
         self._has_train_set = False
+        self.pooling_method = pooling_method
 
     def __repr__(self):
         return f"pyepal at iteration {self.iteration}. \
@@ -513,7 +514,6 @@ def run_one_step(  # pylint:disable=too-many-arguments
         for _ in range(batch_size):
             sampled_idx = self.sample(
                 exclude_idx=samples,
-                pooling_method=self.pooling_method,
                 sample_discarded=sample_discarded,
                 use_coef_var=use_coef_var,
             )
@@ -705,10 +705,9 @@ def augment_design_space(  # pylint: disable=invalid-name
     def sample(
         self,
         exclude_idx: Union[np.array, None] = None,
-        pooling_method: str = "fro",
         sample_discarded: bool = False,
         use_coef_var: bool = True,
-    ) -> int:
+    ) -> Tuple[int, float]:
         """Runs the sampling step based on the size of the hyperrectangle.
         I.e., favoring exploration.
 
         Args:
             exclude_idx (Union[np.array, None], optional):
                 Points in design space to exclude from sampling.
                 Defaults to None.
-            pooling_method (str): Method that is used to aggregate
-                the uncertainty in different objectives into one scalar.
-                Available options are: "fro" (Frobenius/Euclidean norm), "mean",
-                "median". Defaults to "fro".
             sample_discarded (bool): if true, it will sample from all points
                 and not only from the unclassified and Pareto optimal ones
             use_coef_var (bool): If True, uses the coefficient of variation instead of
@@ -754,7 +749,7 @@ def sample(
                 self.rectangle_ups,
                 self._means,
                 sampled_mask,
-                pooling_method,
+                self.pooling_method,
                 use_coef_var,
             )
         else:
@@ -765,8 +760,8 @@ def sample(
                 self.pareto_optimal,
                 self.unclassified,
                 sampled_mask,
-                pooling_method,
+                self.pooling_method,
                 use_coef_var,
             )
 
-        return sampled_idx
+        return sampled_idx, _uncertainty
diff --git a/src/pyepal/pal/pal_ensemble.py b/src/pyepal/pal/pal_ensemble.py
index c75d5a2..c57a5b8 100644
--- a/src/pyepal/pal/pal_ensemble.py
+++ b/src/pyepal/pal/pal_ensemble.py
@@ -2,20 +2,29 @@
 
 
 class PALEnsemble:
-    def __init__(self, pal_list):
+    def __init__(self, pal_list, reuse_models=False):
         self.pal_list = pal_list
 
         # we just pick one class where we will update the models
         self.head_pal = pal_list[0]
+        self.reuse_models = reuse_models
 
     @classmethod
     def from_class_and_kwarg_lists(cls, pal_class, **kwargs):
+
+        # Throw an error if there are no kwargs
+        if not kwargs:
+            raise ValueError("No kwargs provided")
+
         pal_list = []
         iterable_keys = []
         for key, value in kwargs.items():
             if isinstance(value, (list, tuple)):
                 iterable_keys.append(key)
 
+        # The problem here is that some arguments are themselves iterable while others are
+        # not. The code would be much simpler if we just accepted, for every model, its own kwargs.
+
         if len(iterable_keys) == 0:
             raise ValueError(
                 "No iterable keys found in kwargs. If you do not provide iterable keys, please use a single PAL instance."
             )
 
@@ -42,7 +51,6 @@ def from_class_and_kwarg_lists(cls, pal_class, **kwargs):
     def run_one_step(
         self,
         batch_size: int = 1,
-        pooling_method: str = "fro",
         sample_discarded: bool = False,
         use_coef_var: bool = True,
         replace_mean: bool = True,
@@ -51,23 +59,34 @@ def from_class_and_kwarg_lists(cls, pal_class, **kwargs):
         samples = []
         uncertainties = []
         head_samples, head_uncertainties = self.head_pal.run_one_step(
-            batch_size, pooling_method, sample_discarded, use_coef_var, replace_mean, replace_std
+            batch_size, sample_discarded, use_coef_var, replace_mean, replace_std
         )
-        samples.extend(head_samples)
-        uncertainties.extend(head_uncertainties)
+        if isinstance(head_samples, int):
+            head_samples = [head_samples]
+        if isinstance(head_uncertainties, float):
+            head_uncertainties = [head_uncertainties]
 
+        uncertainties.extend(head_uncertainties)
         samples.extend(head_samples)
 
         for pal in self.pal_list[1:]:
             this_samples, this_uncertainties = pal.run_one_step(
                 batch_size,
-                pooling_method,
                 sample_discarded,
                 use_coef_var,
                 replace_mean,
                 replace_std,
-                replace_models=self.head_pal.models,
+                replacement_models=self.head_pal.models if self.reuse_models else None,
             )
+
+            this_uncertainties = np.array(this_uncertainties)
+            this_uncertainties = (
+                this_uncertainties - this_uncertainties.mean()
+            ) / this_uncertainties.std()
+            if isinstance(this_samples, int):
+                this_samples = [this_samples]
+            if isinstance(this_uncertainties, float):
+                this_uncertainties = [this_uncertainties]
             samples.extend(this_samples)
             uncertainties.extend(this_uncertainties)
diff --git a/tests/test_pal_ensemble.py b/tests/test_pal_ensemble.py
index e69de29..b685edd 100644
--- a/tests/test_pal_ensemble.py
+++ b/tests/test_pal_ensemble.py
@@ -0,0 +1,26 @@
+from pyepal.pal.pal_ensemble import PALEnsemble
+import pytest
+import numpy as np
+
+
+def test_pal_ensemble_init(make_random_dataset):
+    from pyepal.pal.pal_gpy import PALGPy
+    from pyepal.models.gpr import build_model
+
+    X, y = make_random_dataset
+    sample_idx = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    # with pytest.raises(ValueError):
+    #     # Shouldn't work if there are no kwargs
+
+    #     ensemble = PALEnsemble.from_class_and_kwarg_lists(PALGPy, [])
+    m0 = build_model(X, y, 0)  # pylint:disable=invalid-name
+    m1 = build_model(X, y, 1)  # pylint:disable=invalid-name
+    m2 = build_model(X, y, 2)  # pylint:disable=invalid-name
+
+    palgpy_instance = PALGPy(X, models=[m0, m1, m2], ndim=3, delta=0.01, pooling_method="fro")
+    palgpy_instance_2 = PALGPy(X, models=[m0, m1, m2], ndim=3, delta=0.01, pooling_method="mean")
+
+    pal_ensemble = PALEnsemble([palgpy_instance, palgpy_instance_2])
+    pal_ensemble.update_train_set(sample_idx, y[sample_idx])
+    sample, _ = pal_ensemble.run_one_step(1)
+    assert len(sample) == 1
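
The standardization added above makes the uncertainties of the non-head members comparable before merging. As a reference for the merge step itself, here is one way to implement the selection that run_one_step aims at, sketched with toy numbers rather than the pyepal API: pool the (index, uncertainty) pairs from all members, keep the largest uncertainty seen per design point, and return the batch_size points with the highest pooled uncertainty.

    import numpy as np

    # Toy pooled results from two ensemble members: design-point indices and the
    # matching standardized uncertainties, in the same order. Numbers are made up.
    samples = [4, 7, 4, 2]
    uncertainties = [0.5, 1.2, 0.9, 0.1]

    # Deduplicate indices, keeping the largest uncertainty observed for each one.
    best = {}
    for idx, unc in zip(samples, uncertainties):
        best[idx] = max(unc, best.get(idx, -np.inf))

    # Rank unique indices by pooled uncertainty and take the top batch_size.
    batch_size = 2
    ranked = sorted(best, key=best.get, reverse=True)
    print(ranked[:batch_size])  # [7, 4]
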
From d26d111ae148cb3a3418e1132713d3a897c16c29 Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka
Date: Tue, 14 Jun 2022 12:46:51 +0200
Subject: [PATCH 3/4] wip: prepare ensemble

---
 tests/test_pal_ensemble.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/tests/test_pal_ensemble.py b/tests/test_pal_ensemble.py
index b685edd..06f2392 100644
--- a/tests/test_pal_ensemble.py
+++ b/tests/test_pal_ensemble.py
@@ -17,10 +17,28 @@ def test_pal_ensemble_init(make_random_dataset):
     m1 = build_model(X, y, 1)  # pylint:disable=invalid-name
     m2 = build_model(X, y, 2)  # pylint:disable=invalid-name
 
-    palgpy_instance = PALGPy(X, models=[m0, m1, m2], ndim=3, delta=0.01, pooling_method="fro")
-    palgpy_instance_2 = PALGPy(X, models=[m0, m1, m2], ndim=3, delta=0.01, pooling_method="mean")
-
+    palgpy_instance = PALGPy(
+        X,
+        models=[m0, m1, m2],
+        ndim=3,
+        delta=0.01,
+        pooling_method="fro",
+        restarts=3,
+    )
+    palgpy_instance_2 = PALGPy(
+        X,
+        models=[m0, m1, m2],
+        ndim=3,
+        delta=0.01,
+        pooling_method="mean",
+        restarts=3,
+    )
+    palgpy_instance.cross_val_points = 0
+    palgpy_instance_2.cross_val_points = 0
     pal_ensemble = PALEnsemble([palgpy_instance, palgpy_instance_2])
     pal_ensemble.update_train_set(sample_idx, y[sample_idx])
     sample, _ = pal_ensemble.run_one_step(1)
     assert len(sample) == 1
+
+    sample, _ = pal_ensemble.run_one_step(20)
+    assert len(sample) == 20

From 56682b420b1b60da386db5294a16c533b1241de7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 14 Jun 2022 10:47:20 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/pyepal/pal/pal_base.py     | 2 +-
 src/pyepal/pal/pal_ensemble.py | 1 +
 tests/test_pal_ensemble.py     | 8 +++++---
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/pyepal/pal/pal_base.py b/src/pyepal/pal/pal_base.py
index da54126..2bd3212 100644
--- a/src/pyepal/pal/pal_base.py
+++ b/src/pyepal/pal/pal_base.py
@@ -20,7 +20,7 @@
 import logging
 import warnings
 from copy import deepcopy
-from typing import Any, Iterable, List, Union, Tuple
+from typing import Any, Iterable, List, Tuple, Union
 
 import numpy as np
 from sklearn.metrics import mean_absolute_error
diff --git a/src/pyepal/pal/pal_ensemble.py b/src/pyepal/pal/pal_ensemble.py
index c57a5b8..7076ca4 100644
--- a/src/pyepal/pal/pal_ensemble.py
+++ b/src/pyepal/pal/pal_ensemble.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import numpy as np
 
 
diff --git a/tests/test_pal_ensemble.py b/tests/test_pal_ensemble.py
index 06f2392..b189ff4 100644
--- a/tests/test_pal_ensemble.py
+++ b/tests/test_pal_ensemble.py
@@ -1,11 +1,13 @@
-from pyepal.pal.pal_ensemble import PALEnsemble
-import pytest
+# -*- coding: utf-8 -*-
 import numpy as np
+import pytest
+
+from pyepal.pal.pal_ensemble import PALEnsemble
 
 
 def test_pal_ensemble_init(make_random_dataset):
-    from pyepal.pal.pal_gpy import PALGPy
     from pyepal.models.gpr import build_model
+    from pyepal.pal.pal_gpy import PALGPy
 
     X, y = make_random_dataset
     sample_idx = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
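
For orientation, this is the end-to-end usage the series converges on, mirroring the final state of tests/test_pal_ensemble.py. It is a sketch with random placeholder data: it assumes pyepal's GPy extra is installed and that build_model and PALGPy accept the arguments exactly as used in the tests above.

    import numpy as np

    from pyepal.models.gpr import build_model
    from pyepal.pal.pal_ensemble import PALEnsemble
    from pyepal.pal.pal_gpy import PALGPy

    rng = np.random.default_rng(42)
    X = rng.normal(size=(60, 10))  # placeholder design space
    y = rng.normal(size=(60, 3))   # placeholder objectives, ndim = 3

    models = [build_model(X, y, i) for i in range(3)]  # one GPR per objective

    # Two members that share the models but pool the objective-wise
    # uncertainties differently ("fro" vs. "mean"), as in the test.
    member_fro = PALGPy(X, models=models, ndim=3, delta=0.01, pooling_method="fro", restarts=3)
    member_mean = PALGPy(X, models=models, ndim=3, delta=0.01, pooling_method="mean", restarts=3)
    member_fro.cross_val_points = 0   # skip the MAE/variance cross-validation
    member_mean.cross_val_points = 0

    ensemble = PALEnsemble([member_fro, member_mean])
    ensemble.update_train_set(np.arange(11), y[:11])
    indices, uncertainties = ensemble.run_one_step(batch_size=1)
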