// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2015 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)

| 31 | #include "ceres/line_search_direction.h" |
| 32 | #include "ceres/line_search_minimizer.h" |
| 33 | #include "ceres/low_rank_inverse_hessian.h" |
| 34 | #include "ceres/internal/eigen.h" |
| 35 | #include "glog/logging.h" |
| 36 | |
| 37 | namespace ceres { |
| 38 | namespace internal { |
| 39 | |
| 40 | class SteepestDescent : public LineSearchDirection { |
| 41 | public: |
| 42 | virtual ~SteepestDescent() {} |
| 43 | bool NextDirection(const LineSearchMinimizer::State& previous, |
| 44 | const LineSearchMinimizer::State& current, |
| 45 | Vector* search_direction) { |
| 46 | *search_direction = -current.gradient; |
| 47 | return true; |
| 48 | } |
| 49 | }; |
| 50 | |
| 51 | class NonlinearConjugateGradient : public LineSearchDirection { |
| 52 | public: |
| 53 | NonlinearConjugateGradient(const NonlinearConjugateGradientType type, |
| 54 | const double function_tolerance) |
| 55 | : type_(type), |
| 56 | function_tolerance_(function_tolerance) { |
| 57 | } |
| 58 | |
| 59 | bool NextDirection(const LineSearchMinimizer::State& previous, |
| 60 | const LineSearchMinimizer::State& current, |
| 61 | Vector* search_direction) { |
| 62 | double beta = 0.0; |
| 63 | Vector gradient_change; |
| 64 | switch (type_) { |
| 65 | case FLETCHER_REEVES: |
| 66 | beta = current.gradient_squared_norm / previous.gradient_squared_norm; |
| 67 | break; |
Sameer Agarwal | c8063df | 2014-06-03 20:26:40 -0700 | [diff] [blame] | 68 | case POLAK_RIBIERE: |
Sameer Agarwal | 9883fc3 | 2012-11-30 12:32:43 -0800 | [diff] [blame] | 69 | gradient_change = current.gradient - previous.gradient; |
| 70 | beta = (current.gradient.dot(gradient_change) / |
| 71 | previous.gradient_squared_norm); |
| 72 | break; |
| 73 | case HESTENES_STIEFEL: |
| 74 | gradient_change = current.gradient - previous.gradient; |
| 75 | beta = (current.gradient.dot(gradient_change) / |
| 76 | previous.search_direction.dot(gradient_change)); |
| 77 | break; |
| 78 | default: |
| 79 | LOG(FATAL) << "Unknown nonlinear conjugate gradient type: " << type_; |
| 80 | } |
| 81 | |
| 82 | *search_direction = -current.gradient + beta * previous.search_direction; |
Sameer Agarwal | 509f68c | 2013-02-20 01:39:03 -0800 | [diff] [blame] | 83 | const double directional_derivative = |
Sameer Agarwal | 931c309 | 2013-02-25 09:46:21 -0800 | [diff] [blame] | 84 | current.gradient.dot(*search_direction); |
Sameer Agarwal | 9883fc3 | 2012-11-30 12:32:43 -0800 | [diff] [blame] | 85 | if (directional_derivative > -function_tolerance_) { |
| 86 | LOG(WARNING) << "Restarting non-linear conjugate gradients: " |
| 87 | << directional_derivative; |
| 88 | *search_direction = -current.gradient; |
Sameer Agarwal | bcc865f | 2014-12-17 07:35:09 -0800 | [diff] [blame] | 89 | } |
Sameer Agarwal | 9883fc3 | 2012-11-30 12:32:43 -0800 | [diff] [blame] | 90 | |
| 91 | return true; |
| 92 | } |
| 93 | |
| 94 | private: |
| 95 | const NonlinearConjugateGradientType type_; |
| 96 | const double function_tolerance_; |
| 97 | }; |
| 98 | |
// Limited-memory BFGS direction. Maintains a low-rank approximation to the
// inverse Hessian (see LowRankInverseHessian) instead of a dense matrix,
// making it suitable for problems with many parameters.
class LBFGS : public LineSearchDirection {
 public:
  LBFGS(const int num_parameters,
        const int max_lbfgs_rank,
        const bool use_approximate_eigenvalue_bfgs_scaling)
      : low_rank_inverse_hessian_(num_parameters,
                                  max_lbfgs_rank,
                                  use_approximate_eigenvalue_bfgs_scaling),
        is_positive_definite_(true) {}

  virtual ~LBFGS() {}

  // Updates the low-rank inverse Hessian approximation with the latest
  // (step, gradient change) pair and sets *search_direction to
  // -H * gradient. Returns false (and latches is_positive_definite_ to
  // false) if the resulting direction is not a descent direction, i.e. the
  // approximation has become indefinite.
  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) {
    // Once the approximation has gone indefinite, this object must not be
    // reused; the minimizer is expected to stop after the false return.
    CHECK(is_positive_definite_)
        << "Ceres bug: NextDirection() called on L-BFGS after inverse Hessian "
        << "approximation has become indefinite, please contact the "
        << "developers!";

    // Feed the secant pair s_k = step, y_k = gradient change into the
    // low-rank model before computing the new direction.
    low_rank_inverse_hessian_.Update(
        previous.search_direction * previous.step_size,
        current.gradient - previous.gradient);

    // RightMultiply accumulates into its output, so zero it first.
    search_direction->setZero();
    low_rank_inverse_hessian_.RightMultiply(current.gradient.data(),
                                            search_direction->data());
    *search_direction *= -1.0;

    // A non-negative directional derivative means -H * g is not a descent
    // direction, which can only happen if H is no longer positive definite.
    if (search_direction->dot(current.gradient) >= 0.0) {
      LOG(WARNING) << "Numerical failure in L-BFGS update: inverse Hessian "
                   << "approximation is not positive definite, and thus "
                   << "initial gradient for search direction is positive: "
                   << search_direction->dot(current.gradient);
      is_positive_definite_ = false;
      return false;
    }

    return true;
  }

 private:
  LowRankInverseHessian low_rank_inverse_hessian_;
  // Latched to false on the first numerical failure; guards against reuse.
  bool is_positive_definite_;
};
| 144 | |
// Dense BFGS direction. Maintains a full num_parameters x num_parameters
// approximation to the inverse Hessian (only the lower triangle is kept up
// to date), so memory is O(N^2); prefer LBFGS for large problems.
class BFGS : public LineSearchDirection {
 public:
  BFGS(const int num_parameters,
       const bool use_approximate_eigenvalue_scaling)
      : num_parameters_(num_parameters),
        use_approximate_eigenvalue_scaling_(use_approximate_eigenvalue_scaling),
        initialized_(false),
        is_positive_definite_(true) {
    LOG_IF(WARNING, num_parameters_ >= 1e3)
        << "BFGS line search being created with: " << num_parameters_
        << " parameters, this will allocate a dense approximate inverse Hessian"
        << " of size: " << num_parameters_ << " x " << num_parameters_
        << ", consider using the L-BFGS memory-efficient line search direction "
        << "instead.";
    // Construct inverse_hessian_ after logging warning about size s.t. if the
    // allocation crashes us, the log will highlight what the issue likely was.
    inverse_hessian_ = Matrix::Identity(num_parameters, num_parameters);
  }

  virtual ~BFGS() {}

  // Applies (at most) one BFGS update to the dense inverse Hessian
  // approximation and sets *search_direction to -H * gradient. Returns
  // false (and latches is_positive_definite_ to false) if the result is
  // not a descent direction.
  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) {
    // Once the approximation has gone indefinite, this object must not be
    // reused; the minimizer is expected to stop after the false return.
    CHECK(is_positive_definite_)
        << "Ceres bug: NextDirection() called on BFGS after inverse Hessian "
        << "approximation has become indefinite, please contact the "
        << "developers!";

    // Secant pair: s_k = delta_x, y_k = delta_gradient.
    const Vector delta_x = previous.search_direction * previous.step_size;
    const Vector delta_gradient = current.gradient - previous.gradient;
    const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);

    // The (L)BFGS algorithm explicitly requires that the secant equation:
    //
    //   B_{k+1} * s_k = y_k
    //
    // Is satisfied at each iteration, where B_{k+1} is the approximated
    // Hessian at the k+1-th iteration, s_k = (x_{k+1} - x_{k}) and
    // y_k = (grad_{k+1} - grad_{k}). As the approximated Hessian must be
    // positive definite, this is equivalent to the condition:
    //
    //   s_k^T * y_k > 0     [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0]
    //
    // This condition would always be satisfied if the function was strictly
    // convex, alternatively, it is always satisfied provided that a Wolfe line
    // search is used (even if the function is not strictly convex).  See [1]
    // (p138) for a proof.
    //
    // Although Ceres will always use a Wolfe line search when using (L)BFGS,
    // practical implementation considerations mean that the line search
    // may return a point that satisfies only the Armijo condition, and thus
    // could violate the Secant equation.  As such, we will only use a step
    // to update the Hessian approximation if:
    //
    //   s_k^T * y_k > tolerance
    //
    // It is important that tolerance is very small (and >=0), as otherwise we
    // might skip the update too often and fail to capture important curvature
    // information in the Hessian.  For example going from 1e-10 -> 1e-14
    // improves the NIST benchmark score from 43/54 to 53/54.
    //
    // [1] Nocedal J, Wright S, Numerical Optimization, 2nd Ed. Springer, 1999.
    //
    // TODO(alexs.mac): Consider using Damped BFGS update instead of
    // skipping update.
    const double kBFGSSecantConditionHessianUpdateTolerance = 1e-14;
    if (delta_x_dot_delta_gradient <=
        kBFGSSecantConditionHessianUpdateTolerance) {
      VLOG(2) << "Skipping BFGS Update, delta_x_dot_delta_gradient too "
              << "small: " << delta_x_dot_delta_gradient << ", tolerance: "
              << kBFGSSecantConditionHessianUpdateTolerance
              << " (Secant condition).";
    } else {
      // Update dense inverse Hessian approximation.

      if (!initialized_ && use_approximate_eigenvalue_scaling_) {
        // Rescale the initial inverse Hessian approximation (H_0) to be
        // iteratively updated so that it is of similar 'size' to the true
        // inverse Hessian at the start point.  As shown in [1]:
        //
        //   \gamma = (delta_gradient_{0}' * delta_x_{0}) /
        //            (delta_gradient_{0}' * delta_gradient_{0})
        //
        // Satisfies:
        //
        //   (1 / \lambda_m) <= \gamma <= (1 / \lambda_1)
        //
        // Where \lambda_1 & \lambda_m are the smallest and largest eigenvalues
        // of the true initial Hessian (not the inverse) respectively. Thus,
        // \gamma is an approximate eigenvalue of the true inverse Hessian, and
        // choosing: H_0 = I * \gamma will yield a starting point that has a
        // similar scale to the true inverse Hessian.  This technique is widely
        // reported to often improve convergence, however this is not
        // universally true, particularly if there are errors in the initial
        // gradients, or if there are significant differences in the sensitivity
        // of the problem to the parameters (i.e. the range of the magnitudes of
        // the components of the gradient is large).
        //
        // The original origin of this rescaling trick is somewhat unclear, the
        // earliest reference appears to be Oren [1], however it is widely
        // discussed without specific attributation in various texts including
        // [2] (p143).
        //
        // [1] Oren S.S., Self-scaling variable metric (SSVM) algorithms
        //     Part II: Implementation and experiments, Management Science,
        //     20(5), 863-874, 1974.
        // [2] Nocedal J., Wright S., Numerical Optimization, Springer, 1999.
        const double approximate_eigenvalue_scale =
            delta_x_dot_delta_gradient / delta_gradient.dot(delta_gradient);
        inverse_hessian_ *= approximate_eigenvalue_scale;

        VLOG(4) << "Applying approximate_eigenvalue_scale: "
                << approximate_eigenvalue_scale << " to initial inverse "
                << "Hessian approximation.";
      }
      initialized_ = true;

      // Efficient O(num_parameters^2) BFGS update [2].
      //
      // Starting from dense BFGS update detailed in Nocedal [2] p140/177 and
      // using: y_k = delta_gradient, s_k = delta_x:
      //
      //   \rho_k = 1.0 / (s_k' * y_k)
      //   V_k = I - \rho_k * y_k * s_k'
      //   H_k = (V_k' * H_{k-1} * V_k) + (\rho_k * s_k * s_k')
      //
      // This update involves matrix, matrix products which naively O(N^3),
      // however we can exploit our knowledge that H_k is positive definite
      // and thus by defn. symmetric to reduce the cost of the update:
      //
      // Expanding the update above yields:
      //
      //   H_k = H_{k-1} +
      //         \rho_k * ( (1.0 + \rho_k * y_k' * H_k * y_k) * s_k * s_k' -
      //                    (s_k * y_k' * H_k + H_k * y_k * s_k') )
      //
      // Using: A = (s_k * y_k' * H_k), and the knowledge that H_k = H_k', the
      // last term simplifies to (A + A'). Note that although A is not symmetric
      // (A + A') is symmetric. For ease of construction we also define
      // B = (1 + \rho_k * y_k' * H_k * y_k) * s_k * s_k', which is by defn
      // symmetric due to construction from: s_k * s_k'.
      //
      // Now we can write the BFGS update as:
      //
      //   H_k = H_{k-1} + \rho_k * (B - (A + A'))

      // For efficiency, as H_k is by defn. symmetric, we will only maintain the
      // *lower* triangle of H_k (and all intermediary terms).

      const double rho_k = 1.0 / delta_x_dot_delta_gradient;

      // Calculate: A = s_k * y_k' * H_k
      Matrix A = delta_x * (delta_gradient.transpose() *
                            inverse_hessian_.selfadjointView<Eigen::Lower>());

      // Calculate scalar: (1 + \rho_k * y_k' * H_k * y_k)
      const double delta_x_times_delta_x_transpose_scale_factor =
          (1.0 + (rho_k * delta_gradient.transpose() *
                  inverse_hessian_.selfadjointView<Eigen::Lower>() *
                  delta_gradient));
      // Calculate: B = (1 + \rho_k * y_k' * H_k * y_k) * s_k * s_k'
      Matrix B = Matrix::Zero(num_parameters_, num_parameters_);
      B.selfadjointView<Eigen::Lower>().
          rankUpdate(delta_x, delta_x_times_delta_x_transpose_scale_factor);

      // Finally, update inverse Hessian approximation according to:
      // H_k = H_{k-1} + \rho_k * (B - (A + A')).  Note that (A + A') is
      // symmetric, even though A is not.
      inverse_hessian_.triangularView<Eigen::Lower>() +=
          rho_k * (B - A - A.transpose());
    }

    // Only the lower triangle of inverse_hessian_ is valid, so use the
    // self-adjoint view to form the full product H * (-g).
    *search_direction =
        inverse_hessian_.selfadjointView<Eigen::Lower>() *
        (-1.0 * current.gradient);

    // A non-negative directional derivative means -H * g is not a descent
    // direction, which can only happen if H is no longer positive definite.
    if (search_direction->dot(current.gradient) >= 0.0) {
      LOG(WARNING) << "Numerical failure in BFGS update: inverse Hessian "
                   << "approximation is not positive definite, and thus "
                   << "initial gradient for search direction is positive: "
                   << search_direction->dot(current.gradient);
      is_positive_definite_ = false;
      return false;
    }

    return true;
  }

 private:
  const int num_parameters_;
  const bool use_approximate_eigenvalue_scaling_;
  // Dense inverse Hessian approximation; only the lower triangle is
  // maintained (see NextDirection).
  Matrix inverse_hessian_;
  // True once the first secant-condition-satisfying update has been applied
  // (gates the one-time approximate eigenvalue rescaling of H_0).
  bool initialized_;
  // Latched to false on the first numerical failure; guards against reuse.
  bool is_positive_definite_;
};
| 341 | |
| 342 | LineSearchDirection* |
Sameer Agarwal | 509f68c | 2013-02-20 01:39:03 -0800 | [diff] [blame] | 343 | LineSearchDirection::Create(const LineSearchDirection::Options& options) { |
Sameer Agarwal | 9883fc3 | 2012-11-30 12:32:43 -0800 | [diff] [blame] | 344 | if (options.type == STEEPEST_DESCENT) { |
| 345 | return new SteepestDescent; |
| 346 | } |
| 347 | |
| 348 | if (options.type == NONLINEAR_CONJUGATE_GRADIENT) { |
| 349 | return new NonlinearConjugateGradient( |
| 350 | options.nonlinear_conjugate_gradient_type, |
| 351 | options.function_tolerance); |
| 352 | } |
| 353 | |
| 354 | if (options.type == ceres::LBFGS) { |
Alex Stewart | 9aa0e3c | 2013-07-05 20:22:37 +0100 | [diff] [blame] | 355 | return new ceres::internal::LBFGS( |
| 356 | options.num_parameters, |
| 357 | options.max_lbfgs_rank, |
| 358 | options.use_approximate_eigenvalue_bfgs_scaling); |
| 359 | } |
| 360 | |
| 361 | if (options.type == ceres::BFGS) { |
| 362 | return new ceres::internal::BFGS( |
| 363 | options.num_parameters, |
| 364 | options.use_approximate_eigenvalue_bfgs_scaling); |
Sameer Agarwal | 9883fc3 | 2012-11-30 12:32:43 -0800 | [diff] [blame] | 365 | } |
| 366 | |
| 367 | LOG(ERROR) << "Unknown line search direction type: " << options.type; |
| 368 | return NULL; |
| 369 | } |
| 370 | |
| 371 | } // namespace internal |
| 372 | } // namespace ceres |