math/optimization/cma_es.hpp

0001 /*
0002  * Copyright Nick Thompson, 2024
0003  * Use, modification and distribution are subject to the
0004  * Boost Software License, Version 1.0. (See accompanying file
0005  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0006  */
0007 #ifndef BOOST_MATH_OPTIMIZATION_CMA_ES_HPP
0008 #define BOOST_MATH_OPTIMIZATION_CMA_ES_HPP
0009 #include <atomic>
0010 #include <cmath>
0011 #include <iostream>
0012 #include <limits>
0013 #include <random>
0014 #include <sstream>
0015 #include <stdexcept>
0016 #include <utility>
0017 #include <vector>
0018 #include <boost/math/optimization/detail/common.hpp>
0019 #include <boost/math/tools/assert.hpp>
0020 #if __has_include(<Eigen/Dense>)
0021 #include <Eigen/Dense>
0022 #else
0023 #error "CMA-ES requires Eigen."
0024 #endif
0025
0026 // Follows the notation in:
0027 // https://arxiv.org/pdf/1604.00772.pdf
0028 // This is a (hopefully) faithful reproduction of the pseudocode in the arxiv review
0029 // by Nikolaus Hansen.
0030 // Comments referring to equations all refer to this arxiv review.
0031 // A slide deck by the same author is given here:
0032 // http://www.cmap.polytechnique.fr/~nikolaus.hansen/CmaTutorialGecco2023-no-audio.pdf
0033 // which is also a very useful reference.
0034
0035 #ifndef BOOST_MATH_DEBUG_CMA_ES
0036 #define BOOST_MATH_DEBUG_CMA_ES 0
0037 #endif
0038
0039 namespace boost::math::optimization {
0040
0041 template <typename ArgumentContainer> struct cma_es_parameters {
0042   using Real = typename ArgumentContainer::value_type;
0043   using DimensionlessReal = decltype(Real()/Real());
0044   ArgumentContainer lower_bounds;
0045   ArgumentContainer upper_bounds;
0046   size_t max_generations = 1000;
0047   ArgumentContainer const *initial_guess = nullptr;
0048   // In the reference, population size = \lambda.
0049   // If the population size is zero, it is set to equation (48) of the reference
0050   // and rounded up to the nearest multiple of threads:
0051   size_t population_size = 0;
0052   // In the reference, learning_rate = c_m:
0053   DimensionlessReal learning_rate = 1;
0054 };
0055
0056 template <typename ArgumentContainer>
0057 void validate_cma_es_parameters(cma_es_parameters<ArgumentContainer> &params) {
0058   using Real = typename ArgumentContainer::value_type;
0059   using DimensionlessReal = decltype(Real()/Real());
0060   using std::isfinite;
0061   using std::isnan;
0062   using std::log;
0063   using std::ceil;
0064   using std::floor;
0065
0066   std::ostringstream oss;
0067   detail::validate_bounds(params.lower_bounds, params.upper_bounds);
0068   if (params.initial_guess) {
0069     detail::validate_initial_guess(*params.initial_guess, params.lower_bounds, params.upper_bounds);
0070   }
0071   const size_t n = params.upper_bounds.size();
0072   // Equation 48 of the arxiv review:
0073   if (params.population_size == 0) {
0074     //auto tmp = 4.0 + floor(3*log(n));
0075     // But round to the nearest multiple of the thread count:
0076     //auto k = static_cast<size_t>(std::ceil(tmp/params.threads));
0077     //params.population_size = k*params.threads;
0078     params.population_size = static_cast<size_t>(4 + floor(3*log(n)));
0079   }
0080   if (params.learning_rate <= DimensionlessReal(0) || !isfinite(params.learning_rate)) {
0081     oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
0082     oss << ": The learning rate must be > 0, but got " << params.learning_rate << ".";
0083     throw std::invalid_argument(oss.str());
0084   }
0085 }
0086
0087 template <typename ArgumentContainer, class Func, class URBG>
0088 ArgumentContainer cma_es(
0089     const Func cost_function,
0090     cma_es_parameters<ArgumentContainer> &params,
0091     URBG &gen,
0092     std::invoke_result_t<Func, ArgumentContainer> target_value = std::numeric_limits<std::invoke_result_t<Func, ArgumentContainer>>::quiet_NaN(),
0093     std::atomic<bool> *cancellation = nullptr,
0094     std::atomic<std::invoke_result_t<Func, ArgumentContainer>> *current_minimum_cost = nullptr,
0095     std::vector<std::pair<ArgumentContainer, std::invoke_result_t<Func, ArgumentContainer>>> *queries = nullptr)
0096  {
0097   using Real = typename ArgumentContainer::value_type;
0098   using DimensionlessReal = decltype(Real()/Real());
0099   using ResultType = std::invoke_result_t<Func, ArgumentContainer>;
0100   using std::abs;
0101   using std::log;
0102   using std::exp;
0103   using std::pow;
0104   using std::min;
0105   using std::max;
0106   using std::sqrt;
0107   using std::isnan;
0108   using std::isfinite;
0109   using std::uniform_real_distribution;
0110   using std::normal_distribution;
0111   validate_cma_es_parameters(params);
0112   // n = dimension of problem:
0113   const size_t n = params.lower_bounds.size();
0114   std::atomic<bool> target_attained = false;
0115   std::atomic<ResultType> lowest_cost = std::numeric_limits<ResultType>::infinity();
0116   ArgumentContainer best_vector;
0117   // p_{c} := evolution path, equation (24) of the arxiv review:
0118   Eigen::Vector<DimensionlessReal, Eigen::Dynamic> p_c(n);
0119   // p_{\sigma} := conjugate evolution path, equation (31) of the arxiv review:
0120   Eigen::Vector<DimensionlessReal, Eigen::Dynamic> p_sigma(n);
0121   if constexpr (detail::has_resize_v<ArgumentContainer>) {
0122     best_vector.resize(n, std::numeric_limits<Real>::quiet_NaN());
0123   }
0124   for (size_t i = 0; i < n; ++i) {
0125     p_c[i] = DimensionlessReal(0);
0126     p_sigma[i] = DimensionlessReal(0);
0127   }
0128   // Table 1, \mu = floor(\lambda/2):
0129   size_t mu = params.population_size/2;
0130   std::vector<DimensionlessReal> w_prime(params.population_size, std::numeric_limits<DimensionlessReal>::quiet_NaN());
0131   for (size_t i = 0; i < params.population_size; ++i) {
0132     // Equation (49), but 0-indexed:
0133     w_prime[i] = log(static_cast<DimensionlessReal>(params.population_size + 1)/(2*(i+1)));
0134   }
0135   // Table 1, notes at top:
0136   DimensionlessReal positive_weight_sum = 0;
0137   DimensionlessReal sq_weight_sum = 0;
0138   for (size_t i = 0; i < mu; ++i) {
0139     BOOST_MATH_ASSERT(w_prime[i] > 0);
0140     positive_weight_sum += w_prime[i];
0141     sq_weight_sum += w_prime[i]*w_prime[i];
0142   }
0143   DimensionlessReal mu_eff = positive_weight_sum*positive_weight_sum/sq_weight_sum;
0144   BOOST_MATH_ASSERT(1 <= mu_eff);
0145   BOOST_MATH_ASSERT(mu_eff <= mu);
0146   DimensionlessReal negative_weight_sum = 0;
0147   sq_weight_sum = 0;
0148   for (size_t i = mu; i < params.population_size; ++i) {
0149     BOOST_MATH_ASSERT(w_prime[i] <= 0);
0150     negative_weight_sum += w_prime[i];
0151     sq_weight_sum += w_prime[i]*w_prime[i];
0152   }
0153   DimensionlessReal mu_eff_m = negative_weight_sum*negative_weight_sum/sq_weight_sum;
0154   // Equation (54):
0155   DimensionlessReal c_m = params.learning_rate;
0156   // Equation (55):
0157   DimensionlessReal c_sigma = (mu_eff + 2)/(n + mu_eff + 5);
0158   BOOST_MATH_ASSERT(c_sigma < 1);
0159   DimensionlessReal d_sigma = 1 + 2*(max)(DimensionlessReal(0), sqrt(DimensionlessReal((mu_eff - 1)/(n + 1))) - DimensionlessReal(1)) + c_sigma;
0160   // Equation (56):
0161   DimensionlessReal c_c = (4 + mu_eff/n)/(n + 4 + 2*mu_eff/n);
0162   BOOST_MATH_ASSERT(c_c <= 1);
0163   // Equation (57):
0164   DimensionlessReal c_1 = DimensionlessReal(2)/(pow(n + 1.3, 2) + mu_eff);
0165   // Equation (58)
0166   DimensionlessReal c_mu = (min)(1 - c_1, 2*(DimensionlessReal(0.25)  + mu_eff  + 1/mu_eff - 2)/((n+2)*(n+2) + mu_eff));
0167   BOOST_MATH_ASSERT(c_1 + c_mu <= DimensionlessReal(1));
0168   // Equation (50):
0169   DimensionlessReal alpha_mu_m = 1 + c_1/c_mu;
0170   // Equation (51):
0171   DimensionlessReal alpha_mu_eff_m = 1 + 2*mu_eff_m/(mu_eff + 2);
0172   // Equation (52):
0173   DimensionlessReal alpha_m_pos_def = (1- c_1 - c_mu)/(n*c_mu);
0174   // Equation (53):
0175   std::vector<DimensionlessReal> weights(params.population_size, std::numeric_limits<DimensionlessReal>::quiet_NaN());
0176   for (size_t i = 0; i < mu; ++i) {
0177     weights[i] = w_prime[i]/positive_weight_sum;
0178   }
0179   DimensionlessReal min_alpha = (min)(alpha_mu_m, (min)(alpha_mu_eff_m, alpha_m_pos_def));
0180   for (size_t i = mu; i < params.population_size; ++i) {
0181     weights[i] = min_alpha*w_prime[i]/abs(negative_weight_sum);
0182   }
0183   // mu:= number of parents, lambda := number of offspring.
0184   Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic> C = Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic>::Identity(n, n);
0185   ArgumentContainer mean_vector;
0186   // See the footnote in Figure 6 of the arxiv review:
0187   // We should consider the more robust initialization described there. . .
0188   Real sigma = DimensionlessReal(0.3)*(params.upper_bounds[0] - params.lower_bounds[0]);;
0189   if (params.initial_guess) {
0190     mean_vector = *params.initial_guess;
0191   }
0192   else {
0193     mean_vector = detail::random_initial_population(params.lower_bounds, params.upper_bounds, 1, gen)[0];
0194   }
0195   auto initial_cost = cost_function(mean_vector);
0196   if (!isnan(initial_cost)) {
0197     best_vector = mean_vector;
0198     lowest_cost = initial_cost;
0199     if (current_minimum_cost) {
0200       *current_minimum_cost = initial_cost;
0201     }
0202   }
0203 #if BOOST_MATH_DEBUG_CMA_ES
0204   {
0205     std::cout << __FILE__ << ":" << __LINE__ << ":" << __func__ << "\n";
0206     std::cout << "\tRunning a (" << params.population_size/2 << "/" << params.population_size/2 << "_W, " << params.population_size << ")-aCMA Evolutionary Strategy on " << params.threads << " threads.\n";
0207     std::cout << "\tInitial mean vector: {";
0208     for (size_t i = 0; i < n - 1; ++i) {
0209       std::cout << mean_vector[i] << ", ";
0210     }
0211     std::cout << mean_vector[n - 1] << "}.\n";
0212     std::cout << "\tCost: " << lowest_cost << ".\n";
0213     std::cout << "\tInitial step length: " << sigma << ".\n";
0214     std::cout << "\tVariance effective selection mass: " << mu_eff << ".\n";
0215     std::cout << "\tLearning rate for rank-one update of covariance matrix: " << c_1 << ".\n";
0216     std::cout << "\tLearning rate for rank-mu update of covariance matrix: " << c_mu << ".\n";
0217     std::cout << "\tDecay rate for cumulation path for step-size control: " << c_sigma << ".\n";
0218     std::cout << "\tLearning rate for the mean: " << c_m << ".\n";
0219     std::cout << "\tDamping parameter for step-size update: " << d_sigma << ".\n";
0220   }
0221 #endif
0222   size_t generation = 0;
0223
0224   std::vector<Eigen::Vector<DimensionlessReal, Eigen::Dynamic>> ys(params.population_size);
0225   std::vector<ArgumentContainer> xs(params.population_size);
0226   std::vector<ResultType> costs(params.population_size, std::numeric_limits<ResultType>::quiet_NaN());
0227   Eigen::Vector<DimensionlessReal, Eigen::Dynamic> weighted_avg_y(n);
0228   Eigen::Vector<DimensionlessReal, Eigen::Dynamic> z(n);
0229   if constexpr (detail::has_resize_v<ArgumentContainer>) {
0230     for (auto & x : xs) {
0231       x.resize(n, std::numeric_limits<Real>::quiet_NaN());
0232     }
0233   }
0234   for (auto & y : ys) {
0235     y.resize(n);
0236   }
0237   normal_distribution<DimensionlessReal> dis(DimensionlessReal(0), DimensionlessReal(1));
0238   do {
0239     if (cancellation && *cancellation) {
0240       break;
0241     }
0242     // TODO: The reference contends the following in
0243     // Section B.2 "Strategy internal numerical effort":
0244     // "In practice, the re-calculation of B and D needs to be done not until about
0245     // max(1, floor(1/(10n(c_1+c_mu)))) generations."
0246     // Note that sigma can be dimensionless, in which case C carries the units.
0247     // This is a weird decision-we're not gonna do that!
0248     Eigen::SelfAdjointEigenSolver<Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic>> eigensolver(C);
0249     if (eigensolver.info() != Eigen::Success) {
0250       std::ostringstream oss;
0251       oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
0252       oss << ": Could not decompose the covariance matrix as BDB^{T}.";
0253       throw std::logic_error(oss.str());
0254     }
0255     Eigen::Matrix<DimensionlessReal, Eigen::Dynamic, Eigen::Dynamic> B = eigensolver.eigenvectors();
0256     // Eigen returns D^2, in the notation of the survey:
0257     auto D = eigensolver.eigenvalues();
0258     // So make it better:
0259     for (auto & d : D) {
0260       if (d <= 0 || isnan(d)) {
0261         std::ostringstream oss;
0262         oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
0263         oss << ": The covariance matrix is not positive definite. This breaks the evolution path computation downstream.\n";
0264         oss << "C=\n" << C << "\n";
0265         oss << "Eigenvalues: " << D;
0266         throw std::domain_error(oss.str());
0267       }
0268       d = sqrt(d);
0269     }
0270
0271     for (size_t k = 0; k < params.population_size; ++k) {
0272       auto & y = ys[k];
0273       auto & x = xs[k];
0274       BOOST_MATH_ASSERT(static_cast<size_t>(x.size()) == n);
0275       BOOST_MATH_ASSERT(static_cast<size_t>(y.size()) == n);
0276       size_t resample_counter = 0;
0277       do {
0278         // equation (39) of Figure 6:
0279         // Also see equation (4):
0280         for (size_t i = 0; i < n; ++i) {
0281           z[i] = dis(gen);
0282         }
0283         Eigen::Vector<DimensionlessReal, Eigen::Dynamic> Dz(n);
0284         for (size_t i = 0; i < n; ++i) {
0285           Dz[i] = D[i]*z[i];
0286         }
0287         y = B*Dz;
0288         for (size_t i = 0; i < n; ++i) {
0289           BOOST_MATH_ASSERT(!isnan(mean_vector[i]));
0290           BOOST_MATH_ASSERT(!isnan(y[i]));
0291           x[i] = mean_vector[i] + sigma*y[i]; // equation (40) of Figure 6.
0292         }
0293         costs[k] = cost_function(x);
0294         if (resample_counter++ == 50) {
0295           std::ostringstream oss;
0296           oss << __FILE__ << ":" << __LINE__ << ":" << __func__;
0297           oss << ": 50 resamples was not sufficient to find an argument to the cost function which did not return NaN.";
0298           oss << " Giving up.";
0299           throw std::domain_error(oss.str());
0300         }
0301       } while (isnan(costs[k]));
0302
0303       if (queries) {
0304         queries->emplace_back(std::make_pair(x, costs[k]));
0305       }
0306       if (costs[k] < lowest_cost) {
0307         lowest_cost = costs[k];
0308         best_vector = x;
0309         if (current_minimum_cost && costs[k] < *current_minimum_cost) {
0310           *current_minimum_cost = costs[k];
0311         }
0312         if (lowest_cost < target_value) {
0313           target_attained = true;
0314           break;
0315         }
0316       }
0317     }
0318     if (target_attained) {
0319       break;
0320     }
0321     if (cancellation && *cancellation) {
0322       break;
0323     }
0324     auto indices = detail::best_indices(costs);
0325     // Equation (41), Figure 6:
0326     for (size_t j = 0; j < n; ++j) {
0327       weighted_avg_y[j] = 0;
0328       for (size_t i = 0; i < mu; ++i) {
0329         BOOST_MATH_ASSERT(!isnan(weights[i]));
0330         BOOST_MATH_ASSERT(!isnan(ys[indices[i]][j]));
0331         weighted_avg_y[j] += weights[i]*ys[indices[i]][j];
0332       }
0333     }
0334     // Equation (42), Figure 6:
0335     for (size_t j = 0; j < n; ++j) {
0336       mean_vector[j] = mean_vector[j] + c_m*sigma*weighted_avg_y[j];
0337     }
0338     // Equation (43), Figure 6: Start with C^{-1/2}<y>_{w}
0339     Eigen::Vector<DimensionlessReal, Eigen::Dynamic> inv_D_B_transpose_y = B.transpose()*weighted_avg_y;
0340     for (long j = 0; j < inv_D_B_transpose_y.size(); ++j) {
0341       inv_D_B_transpose_y[j] /= D[j];
0342     }
0343     Eigen::Vector<DimensionlessReal, Eigen::Dynamic> C_inv_sqrt_y_avg = B*inv_D_B_transpose_y;
0344     // Equation (43), Figure 6:
0345     DimensionlessReal p_sigma_norm = 0;
0346     for (size_t j = 0; j < n; ++j) {
0347       p_sigma[j] = (1-c_sigma)*p_sigma[j] + sqrt(c_sigma*(2-c_sigma)*mu_eff)*C_inv_sqrt_y_avg[j];
0348       p_sigma_norm += p_sigma[j]*p_sigma[j];
0349     }
0350     p_sigma_norm = sqrt(p_sigma_norm);
0351     // A: Algorithm Summary: E[||N(0,1)||]:
0352     const DimensionlessReal expectation_norm_0I = sqrt(static_cast<DimensionlessReal>(n))*(DimensionlessReal(1) - DimensionlessReal(1)/(4*n) + DimensionlessReal(1)/(21*n*n));
0353     // Equation (44), Figure 6:
0354     sigma = sigma*exp(c_sigma*(p_sigma_norm/expectation_norm_0I -1)/d_sigma);
0355     // A: Algorithm Summary:
0356     DimensionlessReal h_sigma = 0;
0357     DimensionlessReal rhs = (DimensionlessReal(1.4) + DimensionlessReal(2)/(n+1))*expectation_norm_0I*sqrt(1 - pow(1-c_sigma, 2*(generation+1)));
0358     if (p_sigma_norm < rhs) {
0359       h_sigma = 1;
0360     }
0361     // Equation (45), Figure 6:
0362     p_c = (1-c_c)*p_c + h_sigma*sqrt(c_c*(2-c_c)*mu_eff)*weighted_avg_y;
0363     DimensionlessReal delta_h_sigma = (1-h_sigma)*c_c*(2-c_c);
0364     DimensionlessReal weight_sum = 0;
0365     for (auto & w : weights) {
0366       weight_sum += w;
0367     }
0368     // Equation (47), Figure 6:
0369     DimensionlessReal K = (1 + c_1*delta_h_sigma - c_1 - c_mu*weight_sum);
0370     // Can these operations be sped up using `.selfadjointView<Eigen::Upper>`?
0371     // Maybe: A.selfadjointView<Eigen::Lower>().rankUpdate(p_c, c_1);?
0372     C = K*C + c_1*p_c*p_c.transpose();
0373     // Incorporate positive weights of Equation (46):
0374     for (size_t i = 0; i < params.population_size/2; ++i) {
0375       C += c_mu*weights[i]*ys[indices[i]]*ys[indices[i]].transpose();
0376     }
0377     for (size_t i = params.population_size/2; i < params.population_size; ++i) {
0378       Eigen::Vector<DimensionlessReal, Eigen::Dynamic> D_inv_BTy = B.transpose()*ys[indices[i]];
0379       for (size_t j = 0; j < n; ++j) {
0380         D_inv_BTy[j] /= D[j];
0381       }
0382       DimensionlessReal squared_norm = D_inv_BTy.squaredNorm();
0383       DimensionlessReal K2 = c_mu*weights[i]/squared_norm;
0384       C += K2*ys[indices[i]]*ys[indices[i]].transpose();
0385     }
0386   } while (generation++ < params.max_generations);
0387
0388   return best_vector;
0389 }
0390
0391 } // namespace boost::math::optimization
0392 #endif