Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ensemble of PAL classes #229

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions src/pyepal/pal/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ def _get_max_wt( # pylint:disable=too-many-arguments
sampled: np.array,
pooling_method: str = "fro",
use_coef_var: bool = True,
) -> int:
) -> Tuple[int, float]:
"""Returns the index in design space with the maximum size of the hyperrectangle
(scaled by the mean predictions, i.e., effectively,
we use the coefficient of variation).
Expand All @@ -300,7 +300,7 @@ def _get_max_wt( # pylint:disable=too-many-arguments
"""
max_uncertainty = -np.inf
maxid = 0

uncertainties = []
pooling_method = pooling_method.lower()

for i in range(0, len(unclassified_t)): # pylint:disable=consider-using-enumerate
Expand All @@ -316,11 +316,12 @@ def _get_max_wt( # pylint:disable=too-many-arguments
uncer = rectangle_ups[i, :] - rectangle_lows[i, :]

uncertainty = _pool(uncer, pooling_method)
uncertainties.append(uncertainty)
if uncertainty > max_uncertainty:
max_uncertainty = uncertainty
maxid = i

return maxid
return maxid, uncertainties


@jit(nopython=True)
Expand All @@ -331,7 +332,7 @@ def _get_max_wt_all( # pylint:disable=too-many-arguments
sampled: np.array,
pooling_method: str = "fro",
use_coef_var: bool = True,
) -> int:
) -> Tuple[int, float]:
"""Returns the index in design space with the maximum size of the hyperrectangle
(scaled by the mean predictions, i.e., effectively,
we use the coefficient of variation).
Expand All @@ -351,10 +352,11 @@ def _get_max_wt_all( # pylint:disable=too-many-arguments
the unscaled rectangle sizes

Returns:
int: index with maximum size of hyperrectangle
Tuple[int, List[float]]: index with maximum size of hyperrectangle, all uncertainties
"""
max_uncertainty = -np.inf
maxid = 0
uncertainties = []

pooling_method = pooling_method.lower()

Expand All @@ -370,11 +372,12 @@ def _get_max_wt_all( # pylint:disable=too-many-arguments
else:
uncer = rectangle_ups[i, :] - rectangle_lows[i, :]
uncertainty = _pool(uncer, pooling_method)
uncertainties.append(uncertainty)
if uncertainty > max_uncertainty:
max_uncertainty = uncertainty
maxid = i

return maxid
return maxid, max_uncertainty


@jit(nopython=True)
Expand Down
52 changes: 30 additions & 22 deletions src/pyepal/pal/pal_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import logging
import warnings
from copy import deepcopy
from typing import List, Union
from typing import Iterable, List, Tuple, Union

import numpy as np
from sklearn.metrics import mean_absolute_error
Expand Down Expand Up @@ -69,6 +69,7 @@ def __init__( # pylint:disable=too-many-arguments
goals: List[str] = None,
coef_var_threshold: float = 3,
ranges: Union[np.ndarray, None] = None,
pooling_method: str = "fro",
):
r"""Initialize the PAL instance

Expand All @@ -95,6 +96,10 @@ def __init__( # pylint:disable=too-many-arguments
If this is provided, we will use :math:`\epsilon \cdot ranges`
to computer the uncertainties of the hyperrectangles instead
of the default behavior :math:`\epsilon \cdot |\mu|`
pooling_method (str): Method that is used to aggregate
the uncertainty in different objectives into one scalar.
Available options are: "fro" (Frobenius/Euclidean norm), "mean",
"median". Defaults to "fro".

"""
self.cross_val_points = 10 # maybe we make it an argument at some point
Expand Down Expand Up @@ -130,6 +135,7 @@ def __init__( # pylint:disable=too-many-arguments
# measurement_uncertainty is provided in update_train_set by the user
self.measurement_uncertainty = np.zeros((design_space_size, self.ndim))
self._has_train_set = False
self.pooling_method = pooling_method

def __repr__(self):
return f"pyepal at iteration {self.iteration}. \
Expand Down Expand Up @@ -441,28 +447,31 @@ def _replace_by_measurements(self, replace_mean: bool = True, replace_std: bool
def run_one_step( # pylint:disable=too-many-arguments
self,
batch_size: int = 1,
pooling_method: str = "fro",
sample_discarded: bool = False,
use_coef_var: bool = True,
replace_mean: bool = True,
replace_std: bool = True,
replacement_models: Iterable[any] = None,
) -> Union[np.array, None]:
"""[summary]
"""Run one iteration of the PAL algorithm. That is, train the models,
get the predictions for all the design points and then classify them.
After classification, return the samples. We do not update the "sampled"
attribute here.

Args:
batch_size (int, optional): Number of indices that will be returned.
If >1 then we use a greedy approximation.
Defaults to 1.
pooling_method (str): Method that is used to aggregate
the uncertainty in different objectives into one scalar.
Available options are: "fro" (Frobenius/Euclidean norm), "mean",
"median". Defaults to "fro".
sample_discarded (bool): if true, it will sample from all points
and not only from the unclassified and Pareto optimal ones
use_coef_var (bool): If True, uses the coefficient of variation instead of
the unscaled rectangle sizes
replace_mean (bool): If true uses the measured _means for the sampled points
replace_std (bool): If true uses the measured standard deviation for the
sampled points
replacement_models: A list of models that will be used to replace the models.
If the models are provided, we skip the hyperparameter optimization and training. It is useful if, for some reason, the same model is trained somewhere else in parallel. Providing this takes precedence over hyperparameter and training schedules.
Defaults to None.

Raises:
ValueError: In case the PAL instance was not initialized with
Expand All @@ -482,10 +491,15 @@ def run_one_step( # pylint:disable=too-many-arguments
if self.should_cross_validate():
self._compare_mae_variance()

if self._should_optimize_hyperparameters():
self._set_hyperparameters()
if replacement_models is None:
if self._should_optimize_hyperparameters():
self._set_hyperparameters()

self._train()
else:
PAL_LOGGER.debug("Replacing models with provided ones.")
self.models = replacement_models

self._train()
self._predict()

self._update_beta()
Expand All @@ -500,7 +514,6 @@ def run_one_step( # pylint:disable=too-many-arguments
for _ in range(batch_size):
sampled_idx = self.sample(
exclude_idx=samples,
pooling_method=pooling_method,
sample_discarded=sample_discarded,
use_coef_var=use_coef_var,
)
Expand Down Expand Up @@ -692,21 +705,16 @@ def augment_design_space( # pylint: disable=invalid-name
def sample(
self,
exclude_idx: Union[np.array, None] = None,
pooling_method: str = "fro",
sample_discarded: bool = False,
use_coef_var: bool = True,
) -> int:
) -> Tuple[int, float]:
"""Runs the sampling step based on the size of the hyperrectangle.
I.e., favoring exploration.

Args:
exclude_idx (Union[np.array, None], optional):
Points in design space to exclude from sampling.
Defaults to None.
pooling_method (str): Method that is used to aggregate
the uncertainty in different objectives into one scalar.
Available options are: "fro" (Frobenius/Euclidean norm), "mean",
"median". Defaults to "fro".
sample_discarded (bool): if true, it will sample from all points
and not only from the unclassified and Pareto optimal ones
use_coef_var (bool): If True, uses the coefficient of variation instead of
Expand Down Expand Up @@ -736,24 +744,24 @@ def sample(
sampled_mask += exclude_mask

if sample_discarded:
sampled_idx = _get_max_wt_all(
sampled_idx, _uncertainty = _get_max_wt_all(
self.rectangle_lows,
self.rectangle_ups,
self._means,
sampled_mask,
pooling_method,
self.pooling_method,
use_coef_var,
)
else:
sampled_idx = _get_max_wt(
sampled_idx, _uncertainty = _get_max_wt(
self.rectangle_lows,
self.rectangle_ups,
self._means,
self.pareto_optimal,
self.unclassified,
sampled_mask,
pooling_method,
self.pooling_method,
use_coef_var,
)

return sampled_idx
return sampled_idx, _uncertainty
114 changes: 114 additions & 0 deletions src/pyepal/pal/pal_ensemble.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
import numpy as np


class PALEnsemble:
    """Ensemble of PAL instances.

    Runs several PAL instances (e.g., constructed with different pooling
    methods or hyperparameters) side by side and aggregates their sampling
    suggestions based on the pooled uncertainties each member reports.
    """

    def __init__(self, pal_list, reuse_models=False):
        """Initialize the ensemble.

        Args:
            pal_list (list): PAL instances forming the ensemble. The first
                entry becomes the "head" instance whose trained models can be
                shared with the remaining members.
            reuse_models (bool): If True, the models trained by the head
                instance are passed as ``replacement_models`` to the other
                members, skipping their hyperparameter optimization and
                training. Defaults to False.

        Raises:
            ValueError: If ``pal_list`` is empty.
        """
        if not pal_list:
            raise ValueError("pal_list must contain at least one PAL instance")
        self.pal_list = pal_list

        # we just pick one class where we will update the models
        self.head_pal = pal_list[0]
        self.reuse_models = reuse_models

    @classmethod
    def from_class_and_kwarg_lists(cls, pal_class, **kwargs):
        """Build an ensemble of ``pal_class`` instances from mixed kwargs.

        Every kwarg whose value is a list or tuple is interpreted as
        "one entry per ensemble member"; all such kwargs must have the same
        length. Scalar kwargs are shared by every member.

        Args:
            pal_class: PAL class that is instantiated for every member.
                (Fix: the original signature consumed the class object as the
                implicit ``@classmethod`` argument, so this constructor could
                never be called.)

        Returns:
            PALEnsemble: ensemble with one ``pal_class`` instance per entry
            of the iterable kwargs.

        Raises:
            ValueError: If no kwargs are provided, if no kwarg is iterable,
                or if the iterable kwargs have inconsistent lengths.
        """
        # Throw error if there are no kwargs
        if not kwargs:
            raise ValueError("No kwargs provided")

        # Fix: isinstance takes a *tuple* of types, not two type arguments.
        iterable_keys = [
            key for key, value in kwargs.items() if isinstance(value, (list, tuple))
        ]

        # NOTE(review): some arguments (e.g. ``models``) are themselves lists
        # for a *single* PAL instance; this heuristic would mis-detect them as
        # per-member values. Accepting per-member kwarg dicts would be cleaner.

        if len(iterable_keys) == 0:
            raise ValueError(
                "No iterable keys found in kwargs. If you do not provide iterable keys, please use a single PAL instance."
            )

        num_values = len(kwargs[iterable_keys[0]])
        for key in iterable_keys:
            if len(kwargs[key]) != num_values:
                raise ValueError(
                    "All iterable keys must have the same length. Please check the length of your iterable keys."
                )

        pal_list = []
        for i in range(num_values):
            this_kwargs = {
                key: (value[i] if key in iterable_keys else value)
                for key, value in kwargs.items()
            }
            pal_list.append(pal_class(**this_kwargs))
        return cls(pal_list)

    def run_one_step(
        self,
        batch_size: int = 1,
        sample_discarded: bool = False,
        use_coef_var: bool = True,
        replace_mean: bool = True,
        replace_std: bool = True,
    ):
        """Run one PAL iteration on every member and aggregate the samples.

        The pooled uncertainties of the non-head members are standardized
        (z-scored) so that members using different pooling methods are
        comparable.
        NOTE(review): the head member's uncertainties are used
        un-standardized, as in the original draft — confirm this asymmetry
        is intended.

        Args:
            batch_size (int): Number of indices to return. Defaults to 1.
            sample_discarded (bool): If True, sample from all points, not
                only the unclassified and Pareto-optimal ones.
            use_coef_var (bool): If True, use the coefficient of variation
                instead of the unscaled rectangle sizes.
            replace_mean (bool): If True, use the measured means for the
                sampled points.
            replace_std (bool): If True, use the measured standard deviations
                for the sampled points.

        Returns:
            Tuple[np.ndarray, np.ndarray]: Selected design-space indices
            (deduplicated, at most ``batch_size``, ascending in uncertainty)
            and their pooled uncertainties, aligned with the indices.
        """
        samples = []
        uncertainties = []
        head_samples, head_uncertainties = self.head_pal.run_one_step(
            batch_size, sample_discarded, use_coef_var, replace_mean, replace_std
        )

        if isinstance(head_samples, (int, np.integer)):
            head_samples = [head_samples]
        if isinstance(head_uncertainties, (float, np.floating)):
            head_uncertainties = [head_uncertainties]
        uncertainties.extend(head_uncertainties)
        samples.extend(head_samples)

        for pal in self.pal_list[1:]:
            this_samples, this_uncertainties = pal.run_one_step(
                batch_size,
                sample_discarded,
                use_coef_var,
                replace_mean,
                replace_std,
                replacement_models=self.head_pal.models if self.reuse_models else None,
            )

            if isinstance(this_samples, (int, np.integer)):
                this_samples = [this_samples]
            this_uncertainties = np.atleast_1d(
                np.asarray(this_uncertainties, dtype=float)
            )
            # Standardize so members with different pooling methods live on a
            # common scale. Guard against zero variance (all pooled
            # uncertainties equal), which previously produced NaNs.
            std = this_uncertainties.std()
            centered = this_uncertainties - this_uncertainties.mean()
            this_uncertainties = centered / std if std > 0 else centered

            samples.extend(this_samples)
            uncertainties.extend(this_uncertainties.tolist())

        # Deduplicate: keep, for every design-space index, its highest pooled
        # uncertainty; then return the top-batch_size indices by uncertainty.
        # (Fixes the original selection, where np.unique re-sorted the
        # deduplicated indices by *index value*, so the tail slice picked the
        # largest indices — not the most uncertain points — and the returned
        # uncertainties were misaligned with the returned indices.)
        best = {}
        for idx, unc in zip(samples, uncertainties):
            if idx not in best or unc > best[idx]:
                best[idx] = unc
        ranked = sorted(best.items(), key=lambda item: item[1])[-batch_size:]
        indices_selected = np.array([idx for idx, _ in ranked])
        uncertainties_selected = np.array([unc for _, unc in ranked])
        return indices_selected, uncertainties_selected

    def augment_design_space(  # pylint: disable=invalid-name
        self, X_design: np.ndarray, classify: bool = False, clean_classify: bool = True
    ) -> None:
        """Add new points to the design space of every ensemble member."""
        for pal in self.pal_list:
            pal.augment_design_space(X_design, classify, clean_classify)

    def update_train_set(
        self,
        indices: np.ndarray,
        measurements: np.ndarray,
        measurement_uncertainty: np.ndarray = None,
    ) -> None:
        """Add new measurements to the training set of every ensemble member."""
        for pal in self.pal_list:
            pal.update_train_set(indices, measurements, measurement_uncertainty)
46 changes: 46 additions & 0 deletions tests/test_pal_ensemble.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
import numpy as np
import pytest

from pyepal.pal.pal_ensemble import PALEnsemble


def test_pal_ensemble_init(make_random_dataset):
    """Smoke test: build a two-member PALGPy ensemble (different pooling
    methods, shared GP models), feed it initial measurements, and check that
    ``run_one_step`` returns the requested number of samples."""
    from pyepal.models.gpr import build_model
    from pyepal.pal.pal_gpy import PALGPy

    X, y = make_random_dataset
    sample_idx = np.arange(11)

    # One single-output GP per objective, shared by both ensemble members.
    m0 = build_model(X, y, 0)  # pylint:disable=invalid-name
    m1 = build_model(X, y, 1)  # pylint:disable=invalid-name
    m2 = build_model(X, y, 2)  # pylint:disable=invalid-name

    palgpy_instance = PALGPy(
        X,
        models=[m0, m1, m2],
        ndim=3,
        delta=0.01,
        pooling_method="fro",
        restarts=3,
    )
    palgpy_instance_2 = PALGPy(
        X,
        models=[m0, m1, m2],
        ndim=3,
        delta=0.01,
        pooling_method="mean",
        restarts=3,
    )
    # Disable cross-validation to keep the test fast.
    palgpy_instance.cross_val_points = 0
    palgpy_instance_2.cross_val_points = 0

    pal_ensemble = PALEnsemble([palgpy_instance, palgpy_instance_2])
    pal_ensemble.update_train_set(sample_idx, y[sample_idx])

    sample, _ = pal_ensemble.run_one_step(1)
    assert len(sample) == 1

    sample, _ = pal_ensemble.run_one_step(20)
    assert len(sample) == 20