diff --git a/README.md b/README.md index 19193a2..01d07f1 100644 --- a/README.md +++ b/README.md @@ -243,7 +243,7 @@ Source code is also available [here](./examples/cmaes_with_margin.py). -#### CatCMA [Hamano et al. 2024] +#### CatCMA [Hamano et al. 2024a] CatCMA is a method for mixed-category optimization problems, which is the problem of simultaneously optimizing continuous and categorical variables. CatCMA employs the joint probability distribution of multivariate Gaussian and categorical distributions as the search distribution. ![CatCMA](https://github.com/CyberAgentAILab/cmaes/assets/27720055/f91443b6-d71b-4849-bfc3-095864f7c58c) @@ -317,6 +317,54 @@ The full source code is available [here](./examples/catcma.py). +#### Maximum a Posteriori CMA-ES [Hamano et al. 2024b] +MAP-CMA is a method that is introduced to interpret the rank-one update in the CMA-ES from the perspective of the natural gradient. +The rank-one update derived from the natural gradient perspective is extensible, and an additional term, called momentum update, appears in the update of the mean vector. +The performance of MAP-CMA is not significantly different from that of CMA-ES, as the primary motivation for MAP-CMA comes from the theoretical understanding of CMA-ES. + +
+ +Source code + +```python +import numpy as np +from cmaes import MAPCMA + + +def rosenbrock(x): + dim = len(x) + if dim < 2: + raise ValueError("dimension must be greater one") + return sum(100 * (x[:-1] ** 2 - x[1:]) ** 2 + (x[:-1] - 1) ** 2) + + +if __name__ == "__main__": + dim = 20 + optimizer = MAPCMA(mean=np.zeros(dim), sigma=0.5, momentum_r=dim) + print(" evals f(x)") + print("====== ==========") + + evals = 0 + while True: + solutions = [] + for _ in range(optimizer.population_size): + x = optimizer.ask() + value = rosenbrock(x) + evals += 1 + solutions.append((x, value)) + if evals % 1000 == 0: + print(f"{evals:5d} {value:10.5f}") + optimizer.tell(solutions) + + if optimizer.should_stop(): + break +``` + +The full source code is available [here](./examples/mapcma.py). + +
+ + #### Separable CMA-ES [Ros and Hansen 2008] Sep-CMA-ES is an algorithm that limits the covariance matrix to a diagonal form. @@ -455,7 +503,8 @@ We have great respect for all libraries involved in CMA-ES. * [Akiba et al. 2019] [T. Akiba, S. Sano, T. Yanase, T. Ohta, M. Koyama, Optuna: A Next-generation Hyperparameter Optimization Framework, KDD, 2019.](https://dl.acm.org/citation.cfm?id=3330701) * [Auger and Hansen 2005] [A. Auger, N. Hansen, A Restart CMA Evolution Strategy with Increasing Population Size, CEC, 2005.](http://www.cmap.polytechnique.fr/~nikolaus.hansen/cec2005ipopcmaes.pdf) * [Hamano et al. 2022] [R. Hamano, S. Saito, M. Nomura, S. Shirakawa, CMA-ES with Margin: Lower-Bounding Marginal Probability for Mixed-Integer Black-Box Optimization, GECCO, 2022.](https://arxiv.org/abs/2205.13482) -* [Hamano et al. 2024] [R. Hamano, S. Saito, M. Nomura, K. Uchida, S. Shirakawa, CatCMA : Stochastic Optimization for Mixed-Category Problems, GECCO, 2024.](https://arxiv.org/abs/2405.09962) +* [Hamano et al. 2024a] [R. Hamano, S. Saito, M. Nomura, K. Uchida, S. Shirakawa, CatCMA : Stochastic Optimization for Mixed-Category Problems, GECCO, 2024.](https://arxiv.org/abs/2405.09962) +* [Hamano et al. 2024b] [R. Hamano, S. Shirakawa, M. Nomura, Natural Gradient Interpretation of Rank-One Update in CMA-ES, PPSN, 2024.](https://arxiv.org/abs/2406.16506) * [Hansen 2016] [N. Hansen, The CMA Evolution Strategy: A Tutorial. arXiv:1604.00772, 2016.](https://arxiv.org/abs/1604.00772) * [Nomura et al. 2021] [M. Nomura, S. Watanabe, Y. Akimoto, Y. Ozaki, M. Onishi, Warm Starting CMA-ES for Hyperparameter Optimization, AAAI, 2021.](https://arxiv.org/abs/2012.06932) * [Nomura et al. 2023] [M. Nomura, Y. Akimoto, I. Ono, CMA-ES with Learning diff --git a/cmaes/__init__.py b/cmaes/__init__.py index 9676cb6..78e3b4e 100644 --- a/cmaes/__init__.py +++ b/cmaes/__init__.py @@ -5,5 +5,6 @@ from ._xnes import XNES # NOQA from ._dxnesic import DXNESIC # NOQA from ._catcma import CatCMA # NOQA +from ._mapcma import MAPCMA # NOQA __version__ = "0.10.0" diff --git a/cmaes/_mapcma.py b/cmaes/_mapcma.py new file mode 100644 index 0000000..08dcf72 --- /dev/null +++ b/cmaes/_mapcma.py @@ -0,0 +1,430 @@ +from __future__ import annotations + +import math +import numpy as np + +from typing import Any +from typing import cast +from typing import Optional + + +_EPS = 1e-8 +_MEAN_MAX = 1e32 +_SIGMA_MAX = 1e32 + + +class MAPCMA: + """MAP-CMA stochastic optimizer class with ask-and-tell interface. + The only difference from the CMA-ES is the additional term in the mean vector update. + + Example: + + .. code:: + + import numpy as np + from cmaes import MAPCMA + + def quadratic(x1, x2): + return (x1 - 3) ** 2 + (10 * (x2 + 2)) ** 2 + + optimizer = MAPCMA(mean=np.zeros(2), sigma=1.3) + + for generation in range(50): + solutions = [] + for _ in range(optimizer.population_size): + # Ask a parameter + x = optimizer.ask() + value = quadratic(x[0], x[1]) + solutions.append((x, value)) + print(f"#{generation} {value} (x1={x[0]}, x2 = {x[1]})") + + # Tell evaluation values. + optimizer.tell(solutions) + + Args: + + mean: + Initial mean vector of multi-variate gaussian distributions. + + sigma: + Initial standard deviation of covariance matrix. + + bounds: + Lower and upper domain boundaries for each parameter (optional). + + n_max_resampling: + A maximum number of resampling parameters (default: 100). + If all sampled parameters are infeasible, the last sampled one + will be clipped with lower and upper bounds. + + seed: + A seed number (optional). + + population_size: + A population size (optional). + + cov: + A covariance matrix (optional). + + momentum_r: + Scaling ratio of momentum update (optional). + """ + + # Paper: https://arxiv.org/abs/2406.16506 + + def __init__( + self, + mean: np.ndarray, + sigma: float, + bounds: Optional[np.ndarray] = None, + n_max_resampling: int = 100, + seed: Optional[int] = None, + population_size: Optional[int] = None, + cov: Optional[np.ndarray] = None, + momentum_r: Optional[float] = None, + ): + assert sigma > 0, "sigma must be non-zero positive value" + + assert np.all( + np.abs(mean) < _MEAN_MAX + ), f"Abs of all elements of mean vector must be less than {_MEAN_MAX}" + + n_dim = len(mean) + assert n_dim > 1, "The dimension of mean must be larger than 1" + + if population_size is None: + population_size = 4 + math.floor(3 * math.log(n_dim)) + assert population_size > 0, "popsize must be non-zero positive value." + + mu = population_size // 2 + + # MAPCMA uses positive weights, in accordance with the paper + # (CMA uses negative weights) + weights_prime = np.array( + [ + math.log((population_size + 1) / 2) - math.log(i + 1) if i < mu else 0 + for i in range(population_size) + ] + ) + weights = weights_prime / weights_prime.sum() + mu_eff = 1 / ((weights**2).sum()) + + # learning rate for the rank-one update + alpha_cov = 2 + c1 = alpha_cov / ((n_dim + 1.3) ** 2 + mu_eff) + # learning rate for the rank-μ update + cmu = min( + 1 - c1 - 1e-8, # 1e-8 is for large popsize. + alpha_cov + * (mu_eff - 2 + 1 / mu_eff) + / ((n_dim + 2) ** 2 + alpha_cov * mu_eff / 2), + ) + assert c1 <= 1 - cmu, "invalid learning rate for the rank-one update" + assert cmu <= 1 - c1, "invalid learning rate for the rank-μ update" + + # scaling ratio of momentum update + if momentum_r is None: + momentum_r = n_dim + assert ( + momentum_r > 0 + ), "scaling ratio of momentum update must be non-zero positive value." + self._r = momentum_r + + # learning rate for the cumulation for the step-size control + c_sigma = (mu_eff + 2) / (n_dim + mu_eff + 5) + d_sigma = 1 + 2 * max(0, math.sqrt((mu_eff - 1) / (n_dim + 1)) - 1) + c_sigma + assert ( + c_sigma < 1 + ), "invalid learning rate for cumulation for the step-size control" + + # learning rate for cumulation for the rank-one update + cc = (4 + mu_eff / n_dim) / (n_dim + 4 + 2 * mu_eff / n_dim) + assert cc <= 1, "invalid learning rate for cumulation for the rank-one update" + + self._n_dim = n_dim + self._popsize = population_size + self._mu = mu + self._mu_eff = mu_eff + + self._cc = cc + self._c1 = c1 + self._cmu = cmu + self._c_sigma = c_sigma + self._d_sigma = d_sigma + # ensuring cm + cm * c1 / (r * cmu) = 1 + self._cm = 1 / (1 + c1 / (self._r * cmu)) + + # E||N(0, I)|| + self._chi_n = math.sqrt(self._n_dim) * ( + 1.0 - (1.0 / (4.0 * self._n_dim)) + 1.0 / (21.0 * (self._n_dim**2)) + ) + + self._weights = weights + + # evolution path + self._p_sigma = np.zeros(n_dim) + self._pc = np.zeros(n_dim) + + self._mean = mean.copy() + + if cov is None: + self._C = np.eye(n_dim) + else: + assert cov.shape == (n_dim, n_dim), "Invalid shape of covariance matrix" + self._C = cov + + self._sigma = sigma + self._D: Optional[np.ndarray] = None + self._B: Optional[np.ndarray] = None + + # bounds contains low and high of each parameter. + assert bounds is None or _is_valid_bounds(bounds, mean), "invalid bounds" + self._bounds = bounds + self._n_max_resampling = n_max_resampling + + self._g = 0 + self._rng = np.random.RandomState(seed) + + # Termination criteria + self._tolx = 1e-12 * sigma + self._tolxup = 1e4 + self._tolfun = 1e-12 + self._tolconditioncov = 1e14 + + self._funhist_term = 10 + math.ceil(30 * n_dim / population_size) + self._funhist_values = np.empty(self._funhist_term * 2) + + def __getstate__(self) -> dict[str, Any]: + attrs = {} + for name in self.__dict__: + # Remove _rng in pickle serialized object. + if name == "_rng": + continue + if name == "_C": + sym1d = _compress_symmetric(self._C) + attrs["_c_1d"] = sym1d + continue + attrs[name] = getattr(self, name) + return attrs + + def __setstate__(self, state: dict[str, Any]) -> None: + state["_C"] = _decompress_symmetric(state["_c_1d"]) + del state["_c_1d"] + self.__dict__.update(state) + # Set _rng for unpickled object. + setattr(self, "_rng", np.random.RandomState()) + + @property + def dim(self) -> int: + """A number of dimensions""" + return self._n_dim + + @property + def population_size(self) -> int: + """A population size""" + return self._popsize + + @property + def generation(self) -> int: + """Generation number which is monotonically incremented + when multi-variate gaussian distribution is updated.""" + return self._g + + @property + def mean(self) -> np.ndarray: + """Mean Vector""" + return self._mean + + def reseed_rng(self, seed: int) -> None: + self._rng.seed(seed) + + def set_bounds(self, bounds: Optional[np.ndarray]) -> None: + """Update boundary constraints""" + assert bounds is None or _is_valid_bounds(bounds, self._mean), "invalid bounds" + self._bounds = bounds + + def ask(self) -> np.ndarray: + """Sample a parameter""" + for i in range(self._n_max_resampling): + x = self._sample_solution() + if self._is_feasible(x): + return x + x = self._sample_solution() + x = self._repair_infeasible_params(x) + return x + + def _eigen_decomposition(self) -> tuple[np.ndarray, np.ndarray]: + if self._B is not None and self._D is not None: + return self._B, self._D + + self._C = (self._C + self._C.T) / 2 + D2, B = np.linalg.eigh(self._C) + D = np.sqrt(np.where(D2 < 0, _EPS, D2)) + self._C = np.dot(np.dot(B, np.diag(D**2)), B.T) + + self._B, self._D = B, D + return B, D + + def _sample_solution(self) -> np.ndarray: + B, D = self._eigen_decomposition() + z = self._rng.randn(self._n_dim) # ~ N(0, I) + y = cast(np.ndarray, B.dot(np.diag(D))).dot(z) # ~ N(0, C) + x = self._mean + self._sigma * y # ~ N(m, σ^2 C) + return x + + def _is_feasible(self, param: np.ndarray) -> bool: + if self._bounds is None: + return True + return cast( + bool, + np.all(param >= self._bounds[:, 0]) and np.all(param <= self._bounds[:, 1]), + ) # Cast bool_ to bool. + + def _repair_infeasible_params(self, param: np.ndarray) -> np.ndarray: + if self._bounds is None: + return param + + # clip with lower and upper bound. + param = np.where(param < self._bounds[:, 0], self._bounds[:, 0], param) + param = np.where(param > self._bounds[:, 1], self._bounds[:, 1], param) + return param + + def tell(self, solutions: list[tuple[np.ndarray, float]]) -> None: + """Tell evaluation values""" + + assert len(solutions) == self._popsize, "Must tell popsize-length solutions." + for s in solutions: + assert np.all( + np.abs(s[0]) < _MEAN_MAX + ), f"Abs of all param values must be less than {_MEAN_MAX} to avoid overflow errors" + + self._g += 1 + solutions.sort(key=lambda s: s[1]) + + # Stores 'best' and 'worst' values of the + # last 'self._funhist_term' generations. + funhist_idx = 2 * (self.generation % self._funhist_term) + self._funhist_values[funhist_idx] = solutions[0][1] + self._funhist_values[funhist_idx + 1] = solutions[-1][1] + + # Sample new population of search_points, for k=1, ..., popsize + B, D = self._eigen_decomposition() + self._B, self._D = None, None + + x_k = np.array([s[0] for s in solutions]) # ~ N(m, σ^2 C) + y_k = (x_k - self._mean) / self._sigma # ~ N(0, C) + + # Selection and recombination + y_w = np.sum(y_k.T * self._weights, axis=1) + + # Evolution paths + # MAP-CMA does not employ the Heaviside function h_sigma for simplifying the update rules. + C_2 = cast( + np.ndarray, cast(np.ndarray, B.dot(np.diag(1 / D))).dot(B.T) + ) # C^(-1/2) = B D^(-1) B^T + self._p_sigma = (1 - self._c_sigma) * self._p_sigma + math.sqrt( + self._c_sigma * (2 - self._c_sigma) * self._mu_eff + ) * C_2.dot(y_w) + + self._pc = (1 - self._cc) * self._pc + math.sqrt( + self._cc * (2 - self._cc) * self._mu_eff + ) * y_w + + # Mean vector update (rank-μ + momentum update) + self._mean += self._cm * ( + self._sigma * y_w + self._c1 / self._r / self._cmu * self._sigma * self._pc + ) + + # Covariance matrix adaption + rank_one = np.outer(self._pc, self._pc) + rank_mu = np.sum( + np.array([w * np.outer(y, y) for w, y in zip(self._weights, y_k)]), axis=0 + ) + self._C = ( + (1 - self._c1 - self._cmu * np.sum(self._weights)) * self._C + + self._c1 * rank_one + + self._cmu * rank_mu + ) + + # Step-size control + norm_p_sigma = np.linalg.norm(self._p_sigma) + self._sigma *= np.exp( + (self._c_sigma / self._d_sigma) * (norm_p_sigma / self._chi_n - 1) + ) + self._sigma = min(self._sigma, _SIGMA_MAX) + + def should_stop(self) -> bool: + B, D = self._eigen_decomposition() + dC = np.diag(self._C) + + # Stop if the range of function values of the recent generation is below tolfun. + if ( + self.generation > self._funhist_term + and np.max(self._funhist_values) - np.min(self._funhist_values) + < self._tolfun + ): + return True + + # Stop if the std of the normal distribution is smaller than tolx + # in all coordinates and pc is smaller than tolx in all components. + if np.all(self._sigma * dC < self._tolx) and np.all( + self._sigma * self._pc < self._tolx + ): + return True + + # Stop if detecting divergent behavior. + if self._sigma * np.max(D) > self._tolxup: + return True + + # No effect coordinates: stop if adding 0.2-standard deviations + # in any single coordinate does not change m. + if np.any(self._mean == self._mean + (0.2 * self._sigma * np.sqrt(dC))): + return True + + # No effect axis: stop if adding 0.1-standard deviation vector in + # any principal axis direction of C does not change m. "pycma" check + # axis one by one at each generation. + i = self.generation % self.dim + if np.all(self._mean == self._mean + (0.1 * self._sigma * D[i] * B[:, i])): + return True + + # Stop if the condition number of the covariance matrix exceeds 1e14. + condition_cov = np.max(D) / np.min(D) + if condition_cov > self._tolconditioncov: + return True + + return False + + +def _is_valid_bounds(bounds: Optional[np.ndarray], mean: np.ndarray) -> bool: + if bounds is None: + return True + if (mean.size, 2) != bounds.shape: + return False + if not np.all(bounds[:, 0] <= mean): + return False + if not np.all(mean <= bounds[:, 1]): + return False + return True + + +def _compress_symmetric(sym2d: np.ndarray) -> np.ndarray: + assert len(sym2d.shape) == 2 and sym2d.shape[0] == sym2d.shape[1] + n = sym2d.shape[0] + dim = (n * (n + 1)) // 2 + sym1d = np.zeros(dim) + start = 0 + for i in range(n): + sym1d[start : start + n - i] = sym2d[i][i:] # noqa: E203 + start += n - i + return sym1d + + +def _decompress_symmetric(sym1d: np.ndarray) -> np.ndarray: + n = int(np.sqrt(sym1d.size * 2)) + assert (n * (n + 1)) // 2 == sym1d.size + R, C = np.triu_indices(n) + out = np.zeros((n, n), dtype=sym1d.dtype) + out[R, C] = sym1d + out[C, R] = sym1d + return out diff --git a/examples/mapcma.py b/examples/mapcma.py new file mode 100644 index 0000000..e055b80 --- /dev/null +++ b/examples/mapcma.py @@ -0,0 +1,31 @@ +import numpy as np +from cmaes import MAPCMA + + +def rosenbrock(x): + dim = len(x) + if dim < 2: + raise ValueError("dimension must be greater one") + return sum(100 * (x[:-1] ** 2 - x[1:]) ** 2 + (x[:-1] - 1) ** 2) + + +if __name__ == "__main__": + dim = 20 + optimizer = MAPCMA(mean=np.zeros(dim), sigma=0.5, momentum_r=dim) + print(" evals f(x)") + print("====== ==========") + + evals = 0 + while True: + solutions = [] + for _ in range(optimizer.population_size): + x = optimizer.ask() + value = rosenbrock(x) + evals += 1 + solutions.append((x, value)) + if evals % 1000 == 0: + print(f"{evals:5d} {value:10.5f}") + optimizer.tell(solutions) + + if optimizer.should_stop(): + break