Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ensemble of PAL classes #229

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions src/pyepal/pal/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ def _get_max_wt( # pylint:disable=too-many-arguments
sampled: np.array,
pooling_method: str = "fro",
use_coef_var: bool = True,
) -> int:
) -> Tuple[int, float]:
"""Returns the index in design space with the maximum size of the hyperrectangle
(scaled by the mean predictions, i.e., effectively,
we use the coefficient of variation).
Expand All @@ -300,7 +300,7 @@ def _get_max_wt( # pylint:disable=too-many-arguments
"""
max_uncertainty = -np.inf
maxid = 0

uncertainties = []
pooling_method = pooling_method.lower()

for i in range(0, len(unclassified_t)): # pylint:disable=consider-using-enumerate
Expand All @@ -316,11 +316,12 @@ def _get_max_wt( # pylint:disable=too-many-arguments
uncer = rectangle_ups[i, :] - rectangle_lows[i, :]

uncertainty = _pool(uncer, pooling_method)
uncertainties.append(uncertainty)
if uncertainty > max_uncertainty:
max_uncertainty = uncertainty
maxid = i

return maxid
return maxid, uncertainties


@jit(nopython=True)
Expand All @@ -331,7 +332,7 @@ def _get_max_wt_all( # pylint:disable=too-many-arguments
sampled: np.array,
pooling_method: str = "fro",
use_coef_var: bool = True,
) -> int:
) -> Tuple[int, float]:
"""Returns the index in design space with the maximum size of the hyperrectangle
(scaled by the mean predictions, i.e., effectively,
we use the coefficient of variation).
Expand All @@ -351,10 +352,11 @@ def _get_max_wt_all( # pylint:disable=too-many-arguments
the unscaled rectangle sizes

Returns:
int: index with maximum size of hyperrectangle
Tuple[int, List[float]]: index with maximum size of hyperrectangle, all uncertainties
"""
max_uncertainty = -np.inf
maxid = 0
uncertainties = []

pooling_method = pooling_method.lower()

Expand All @@ -370,11 +372,12 @@ def _get_max_wt_all( # pylint:disable=too-many-arguments
else:
uncer = rectangle_ups[i, :] - rectangle_lows[i, :]
uncertainty = _pool(uncer, pooling_method)
uncertainties.append(uncertainty)
if uncertainty > max_uncertainty:
max_uncertainty = uncertainty
maxid = i

return maxid
return maxid, max_uncertainty


@jit(nopython=True)
Expand Down
52 changes: 30 additions & 22 deletions src/pyepal/pal/pal_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import logging
import warnings
from copy import deepcopy
from typing import List, Union
from typing import Iterable, List, Tuple, Union

import numpy as np
from sklearn.metrics import mean_absolute_error
Expand Down Expand Up @@ -69,6 +69,7 @@ def __init__( # pylint:disable=too-many-arguments
goals: List[str] = None,
coef_var_threshold: float = 3,
ranges: Union[np.ndarray, None] = None,
pooling_method: str = "fro",
):
r"""Initialize the PAL instance

Expand All @@ -95,6 +96,10 @@ def __init__( # pylint:disable=too-many-arguments
If this is provided, we will use :math:`\epsilon \cdot ranges`
to computer the uncertainties of the hyperrectangles instead
of the default behavior :math:`\epsilon \cdot |\mu|`
pooling_method (str): Method that is used to aggregate
the uncertainty in different objectives into one scalar.
Available options are: "fro" (Frobenius/Euclidean norm), "mean",
"median". Defaults to "fro".

"""
self.cross_val_points = 10 # maybe we make it an argument at some point
Expand Down Expand Up @@ -130,6 +135,7 @@ def __init__( # pylint:disable=too-many-arguments
# measurement_uncertainty is provided in update_train_set by the user
self.measurement_uncertainty = np.zeros((design_space_size, self.ndim))
self._has_train_set = False
self.pooling_method = pooling_method

def __repr__(self):
return f"pyepal at iteration {self.iteration}. \
Expand Down Expand Up @@ -441,28 +447,31 @@ def _replace_by_measurements(self, replace_mean: bool = True, replace_std: bool
def run_one_step( # pylint:disable=too-many-arguments
self,
batch_size: int = 1,
pooling_method: str = "fro",
sample_discarded: bool = False,
use_coef_var: bool = True,
replace_mean: bool = True,
replace_std: bool = True,
replacement_models: Iterable[any] = None,
) -> Union[np.array, None]:
"""[summary]
"""Run one iteration of the PAL algorithm. That is, train the models,
get the predictions for all the design points and then classify them.
After classification, return the samples. We do not update the "sampled"
attribute here.

Args:
batch_size (int, optional): Number of indices that will be returned.
If >1 then we use a greedy approximation.
Defaults to 1.
pooling_method (str): Method that is used to aggregate
the uncertainty in different objectives into one scalar.
Available options are: "fro" (Frobenius/Euclidean norm), "mean",
"median". Defaults to "fro".
sample_discarded (bool): if true, it will sample from all points
and not only from the unclassified and Pareto optimal ones
use_coef_var (bool): If True, uses the coefficient of variation instead of
the unscaled rectangle sizes
replace_mean (bool): If true uses the measured _means for the sampled points
replace_std (bool): If true uses the measured standard deviation for the
sampled points
replacement_models: A list of models that will be used to replace the models.
If the models are provided, we skip the hyperparameter optimization and training. It is useful if, for some reason, the same model is trained somewhere else in parallel. Providing this takes precedence over hyperparameter and training schedules.
Defaults to None.

Raises:
ValueError: In case the PAL instance was not initialized with
Expand All @@ -482,10 +491,15 @@ def run_one_step( # pylint:disable=too-many-arguments
if self.should_cross_validate():
self._compare_mae_variance()

if self._should_optimize_hyperparameters():
self._set_hyperparameters()
if replacement_models is None:
if self._should_optimize_hyperparameters():
self._set_hyperparameters()

self._train()
else:
PAL_LOGGER.debug("Replacing models with provided ones.")
self.models = replacement_models

self._train()
self._predict()

self._update_beta()
Expand All @@ -500,7 +514,6 @@ def run_one_step( # pylint:disable=too-many-arguments
for _ in range(batch_size):
sampled_idx = self.sample(
exclude_idx=samples,
pooling_method=pooling_method,
sample_discarded=sample_discarded,
use_coef_var=use_coef_var,
)
Expand Down Expand Up @@ -692,21 +705,16 @@ def augment_design_space( # pylint: disable=invalid-name
def sample(
self,
exclude_idx: Union[np.array, None] = None,
pooling_method: str = "fro",
sample_discarded: bool = False,
use_coef_var: bool = True,
) -> int:
) -> Tuple[int, float]:
"""Runs the sampling step based on the size of the hyperrectangle.
I.e., favoring exploration.

Args:
exclude_idx (Union[np.array, None], optional):
Points in design space to exclude from sampling.
Defaults to None.
pooling_method (str): Method that is used to aggregate
the uncertainty in different objectives into one scalar.
Available options are: "fro" (Frobenius/Euclidean norm), "mean",
"median". Defaults to "fro".
sample_discarded (bool): if true, it will sample from all points
and not only from the unclassified and Pareto optimal ones
use_coef_var (bool): If True, uses the coefficient of variation instead of
Expand Down Expand Up @@ -736,24 +744,24 @@ def sample(
sampled_mask += exclude_mask

if sample_discarded:
sampled_idx = _get_max_wt_all(
sampled_idx, _uncertainty = _get_max_wt_all(
self.rectangle_lows,
self.rectangle_ups,
self._means,
sampled_mask,
pooling_method,
self.pooling_method,
use_coef_var,
)
else:
sampled_idx = _get_max_wt(
sampled_idx, _uncertainty = _get_max_wt(
self.rectangle_lows,
self.rectangle_ups,
self._means,
self.pareto_optimal,
self.unclassified,
sampled_mask,
pooling_method,
self.pooling_method,
use_coef_var,
)

return sampled_idx
return sampled_idx, _uncertainty
114 changes: 114 additions & 0 deletions src/pyepal/pal/pal_ensemble.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
import numpy as np


class PALEnsemble:
    """Ensemble of PAL instances.

    Runs several PAL instances (e.g., constructed with different pooling
    methods or hyperparameters) side by side and aggregates their sampling
    suggestions based on the pooled uncertainties each member reports.
    """

    def __init__(self, pal_list, reuse_models=False):
        """Initialize the ensemble.

        Args:
            pal_list (list): PAL instances forming the ensemble. The first
                entry becomes the "head" instance whose trained models can be
                shared with the remaining members.
            reuse_models (bool): If True, the models trained by the head
                instance are passed as ``replacement_models`` to the other
                members, skipping their hyperparameter optimization and
                training. Defaults to False.

        Raises:
            ValueError: If ``pal_list`` is empty.
        """
        if not pal_list:
            raise ValueError("pal_list must contain at least one PAL instance")
        self.pal_list = pal_list

        # we just pick one class where we will update the models
        self.head_pal = pal_list[0]
        self.reuse_models = reuse_models

    @classmethod
    def from_class_and_kwarg_lists(cls, pal_class, **kwargs):
        """Build an ensemble of ``pal_class`` instances from mixed kwargs.

        Every kwarg whose value is a list or tuple is interpreted as
        "one entry per ensemble member"; all such kwargs must have the same
        length. Scalar kwargs are shared by every member.

        Args:
            pal_class: PAL class that is instantiated for every member.
                (Fix: the original signature consumed the class object as the
                implicit ``@classmethod`` argument, so this constructor could
                never be called.)

        Returns:
            PALEnsemble: ensemble with one ``pal_class`` instance per entry
            of the iterable kwargs.

        Raises:
            ValueError: If no kwargs are provided, if no kwarg is iterable,
                or if the iterable kwargs have inconsistent lengths.
        """
        # Throw error if there are no kwargs
        if not kwargs:
            raise ValueError("No kwargs provided")

        # Fix: isinstance takes a *tuple* of types, not two type arguments.
        iterable_keys = [
            key for key, value in kwargs.items() if isinstance(value, (list, tuple))
        ]

        # NOTE(review): some arguments (e.g. ``models``) are themselves lists
        # for a *single* PAL instance; this heuristic would mis-detect them as
        # per-member values. Accepting per-member kwarg dicts would be cleaner.

        if len(iterable_keys) == 0:
            raise ValueError(
                "No iterable keys found in kwargs. If you do not provide iterable keys, please use a single PAL instance."
            )

        num_values = len(kwargs[iterable_keys[0]])
        for key in iterable_keys:
            if len(kwargs[key]) != num_values:
                raise ValueError(
                    "All iterable keys must have the same length. Please check the length of your iterable keys."
                )

        pal_list = []
        for i in range(num_values):
            this_kwargs = {
                key: (value[i] if key in iterable_keys else value)
                for key, value in kwargs.items()
            }
            pal_list.append(pal_class(**this_kwargs))
        return cls(pal_list)

    def run_one_step(
        self,
        batch_size: int = 1,
        sample_discarded: bool = False,
        use_coef_var: bool = True,
        replace_mean: bool = True,
        replace_std: bool = True,
    ):
        """Run one PAL iteration on every member and aggregate the samples.

        The pooled uncertainties of the non-head members are standardized
        (z-scored) so that members using different pooling methods are
        comparable.
        NOTE(review): the head member's uncertainties are used
        un-standardized, as in the original draft — confirm this asymmetry
        is intended.

        Args:
            batch_size (int): Number of indices to return. Defaults to 1.
            sample_discarded (bool): If True, sample from all points, not
                only the unclassified and Pareto-optimal ones.
            use_coef_var (bool): If True, use the coefficient of variation
                instead of the unscaled rectangle sizes.
            replace_mean (bool): If True, use the measured means for the
                sampled points.
            replace_std (bool): If True, use the measured standard deviations
                for the sampled points.

        Returns:
            Tuple[np.ndarray, np.ndarray]: Selected design-space indices
            (deduplicated, at most ``batch_size``, ascending in uncertainty)
            and their pooled uncertainties, aligned with the indices.
        """
        samples = []
        uncertainties = []
        head_samples, head_uncertainties = self.head_pal.run_one_step(
            batch_size, sample_discarded, use_coef_var, replace_mean, replace_std
        )

        if isinstance(head_samples, (int, np.integer)):
            head_samples = [head_samples]
        if isinstance(head_uncertainties, (float, np.floating)):
            head_uncertainties = [head_uncertainties]
        uncertainties.extend(head_uncertainties)
        samples.extend(head_samples)

        for pal in self.pal_list[1:]:
            this_samples, this_uncertainties = pal.run_one_step(
                batch_size,
                sample_discarded,
                use_coef_var,
                replace_mean,
                replace_std,
                replacement_models=self.head_pal.models if self.reuse_models else None,
            )

            if isinstance(this_samples, (int, np.integer)):
                this_samples = [this_samples]
            this_uncertainties = np.atleast_1d(
                np.asarray(this_uncertainties, dtype=float)
            )
            # Standardize so members with different pooling methods live on a
            # common scale. Guard against zero variance (all pooled
            # uncertainties equal), which previously produced NaNs.
            std = this_uncertainties.std()
            centered = this_uncertainties - this_uncertainties.mean()
            this_uncertainties = centered / std if std > 0 else centered

            samples.extend(this_samples)
            uncertainties.extend(this_uncertainties.tolist())

        # Deduplicate: keep, for every design-space index, its highest pooled
        # uncertainty; then return the top-batch_size indices by uncertainty.
        # (Fixes the original selection, where np.unique re-sorted the
        # deduplicated indices by *index value*, so the tail slice picked the
        # largest indices — not the most uncertain points — and the returned
        # uncertainties were misaligned with the returned indices.)
        best = {}
        for idx, unc in zip(samples, uncertainties):
            if idx not in best or unc > best[idx]:
                best[idx] = unc
        ranked = sorted(best.items(), key=lambda item: item[1])[-batch_size:]
        indices_selected = np.array([idx for idx, _ in ranked])
        uncertainties_selected = np.array([unc for _, unc in ranked])
        return indices_selected, uncertainties_selected

    def augment_design_space(  # pylint: disable=invalid-name
        self, X_design: np.ndarray, classify: bool = False, clean_classify: bool = True
    ) -> None:
        """Add new points to the design space of every ensemble member."""
        for pal in self.pal_list:
            pal.augment_design_space(X_design, classify, clean_classify)

    def update_train_set(
        self,
        indices: np.ndarray,
        measurements: np.ndarray,
        measurement_uncertainty: np.ndarray = None,
    ) -> None:
        """Add new measurements to the training set of every ensemble member."""
        for pal in self.pal_list:
            pal.update_train_set(indices, measurements, measurement_uncertainty)
46 changes: 46 additions & 0 deletions tests/test_pal_ensemble.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
import numpy as np
import pytest

from pyepal.pal.pal_ensemble import PALEnsemble


def test_pal_ensemble_init(make_random_dataset):
    """Smoke test: build a two-member PALGPy ensemble (different pooling
    methods, shared GP models), feed it initial measurements, and check that
    ``run_one_step`` returns the requested number of samples."""
    from pyepal.models.gpr import build_model
    from pyepal.pal.pal_gpy import PALGPy

    X, y = make_random_dataset
    sample_idx = np.arange(11)

    # One single-output GP per objective, shared by both ensemble members.
    m0 = build_model(X, y, 0)  # pylint:disable=invalid-name
    m1 = build_model(X, y, 1)  # pylint:disable=invalid-name
    m2 = build_model(X, y, 2)  # pylint:disable=invalid-name

    palgpy_instance = PALGPy(
        X,
        models=[m0, m1, m2],
        ndim=3,
        delta=0.01,
        pooling_method="fro",
        restarts=3,
    )
    palgpy_instance_2 = PALGPy(
        X,
        models=[m0, m1, m2],
        ndim=3,
        delta=0.01,
        pooling_method="mean",
        restarts=3,
    )
    # Disable cross-validation to keep the test fast.
    palgpy_instance.cross_val_points = 0
    palgpy_instance_2.cross_val_points = 0

    pal_ensemble = PALEnsemble([palgpy_instance, palgpy_instance_2])
    pal_ensemble.update_train_set(sample_idx, y[sample_idx])

    sample, _ = pal_ensemble.run_one_step(1)
    assert len(sample) == 1

    sample, _ = pal_ensemble.run_one_step(20)
    assert len(sample) == 20