// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2012 Google Inc. All rights reserved.
// http://code.google.com/p/ceres-solver/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)

#ifndef CERES_NO_LINE_SEARCH_MINIMIZER

#include "ceres/line_search_direction.h"
#include "ceres/line_search_minimizer.h"
#include "ceres/low_rank_inverse_hessian.h"
#include "ceres/internal/eigen.h"
#include "glog/logging.h"

namespace ceres {
namespace internal {

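// Steepest (gradient) descent: the search direction is simply the negative
// gradient at the current point.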
class SteepestDescent : public LineSearchDirection {
 public:
  virtual ~SteepestDescent() {}
  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) {
    *search_direction = -current.gradient;
    return true;
  }
};

class NonlinearConjugateGradient : public LineSearchDirection {
 public:
  NonlinearConjugateGradient(const NonlinearConjugateGradientType type,
                             const double function_tolerance)
      : type_(type),
        function_tolerance_(function_tolerance) {
  }

  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) {
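    // Compute the conjugate gradient parameter \beta. With g = gradient and
    // d = search_direction, and subscripts k and k-1 denoting the current
    // and previous iterations, the variants below compute:
    //
    //   Fletcher-Reeves:   \beta = (g_k' * g_k) / (g_{k-1}' * g_{k-1})
    //   Polak-Ribiere:     \beta = (g_k' * (g_k - g_{k-1})) /
    //                              (g_{k-1}' * g_{k-1})
    //   Hestenes-Stiefel:  \beta = (g_k' * (g_k - g_{k-1})) /
    //                              (d_{k-1}' * (g_k - g_{k-1}))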
    double beta = 0.0;
    Vector gradient_change;
    switch (type_) {
      case FLETCHER_REEVES:
        beta = current.gradient_squared_norm / previous.gradient_squared_norm;
        break;
      case POLAK_RIBIRERE:
        gradient_change = current.gradient - previous.gradient;
        beta = (current.gradient.dot(gradient_change) /
                previous.gradient_squared_norm);
        break;
      case HESTENES_STIEFEL:
        gradient_change = current.gradient - previous.gradient;
        beta = (current.gradient.dot(gradient_change) /
                previous.search_direction.dot(gradient_change));
        break;
      default:
        LOG(FATAL) << "Unknown nonlinear conjugate gradient type: " << type_;
    }

    *search_direction = -current.gradient + beta * previous.search_direction;
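    // The direction produced by the \beta update is not guaranteed to be a
    // descent direction; if it is not sufficiently downhill, i.e.
    // g_k' * d_k > -function_tolerance, restart with steepest descent.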
    const double directional_derivative =
        current.gradient.dot(*search_direction);
    if (directional_derivative > -function_tolerance_) {
      LOG(WARNING) << "Restarting non-linear conjugate gradients: "
                   << directional_derivative;
      *search_direction = -current.gradient;
    }

    return true;
  }

 private:
  const NonlinearConjugateGradientType type_;
  const double function_tolerance_;
};

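// Limited-memory BFGS (L-BFGS): instead of forming a dense inverse Hessian
// approximation, maintain a low-rank approximation built from the last
// max_lbfgs_rank {step, gradient-change} pairs (see LowRankInverseHessian),
// reducing storage from O(num_parameters^2) to O(num_parameters * rank).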
class LBFGS : public LineSearchDirection {
 public:
  LBFGS(const int num_parameters,
        const int max_lbfgs_rank,
        const bool use_approximate_eigenvalue_bfgs_scaling)
      : low_rank_inverse_hessian_(num_parameters,
                                  max_lbfgs_rank,
                                  use_approximate_eigenvalue_bfgs_scaling),
        is_positive_definite_(true) {}

  virtual ~LBFGS() {}

  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) {
    CHECK(is_positive_definite_)
        << "Ceres bug: NextDirection() called on L-BFGS after inverse Hessian "
        << "approximation has become indefinite, please contact the "
        << "developers!";

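    // Fold the most recent step (s_k = step_size * d_{k-1}) and gradient
    // change (y_k = g_k - g_{k-1}) into the low-rank model, then compute the
    // search direction as d_k = -H_k * g_k.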
    low_rank_inverse_hessian_.Update(
        previous.search_direction * previous.step_size,
        current.gradient - previous.gradient);
    search_direction->setZero();
    low_rank_inverse_hessian_.RightMultiply(current.gradient.data(),
                                            search_direction->data());
    *search_direction *= -1.0;

    if (search_direction->dot(current.gradient) >= 0.0) {
      LOG(WARNING) << "Numerical failure in L-BFGS update: inverse Hessian "
                   << "approximation is not positive definite, and thus "
                   << "initial gradient for search direction is positive: "
                   << search_direction->dot(current.gradient);
      is_positive_definite_ = false;
      return false;
    }

    return true;
  }

 private:
  LowRankInverseHessian low_rank_inverse_hessian_;
  bool is_positive_definite_;
};

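// Dense BFGS: maintains an explicit num_parameters x num_parameters
// approximation to the inverse Hessian, updated in O(num_parameters^2) per
// iteration; suitable only for problems with a modest number of parameters.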
class BFGS : public LineSearchDirection {
 public:
  BFGS(const int num_parameters,
       const bool use_approximate_eigenvalue_scaling)
      : num_parameters_(num_parameters),
        use_approximate_eigenvalue_scaling_(use_approximate_eigenvalue_scaling),
        initialized_(false),
        is_positive_definite_(true) {
    LOG_IF(WARNING, num_parameters_ >= 1e3)
        << "BFGS line search being created with: " << num_parameters_
        << " parameters; this will allocate a dense approximate inverse "
        << "Hessian of size: " << num_parameters_ << " x " << num_parameters_
        << ". Consider using the L-BFGS memory-efficient line search "
        << "direction instead.";
    // Construct inverse_hessian_ after logging the warning about its size so
    // that, if the allocation crashes us, the log will highlight what the
    // issue likely was.
    inverse_hessian_ = Matrix::Identity(num_parameters, num_parameters);
  }

  virtual ~BFGS() {}

  bool NextDirection(const LineSearchMinimizer::State& previous,
                     const LineSearchMinimizer::State& current,
                     Vector* search_direction) {
    CHECK(is_positive_definite_)
        << "Ceres bug: NextDirection() called on BFGS after inverse Hessian "
        << "approximation has become indefinite, please contact the "
        << "developers!";

    const Vector delta_x = previous.search_direction * previous.step_size;
    const Vector delta_gradient = current.gradient - previous.gradient;
    const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);

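    // The BFGS update preserves positive definiteness of the inverse Hessian
    // approximation only if the curvature condition s_k' * y_k > 0 holds;
    // skip the update when it fails (numerically) to hold.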
    if (delta_x_dot_delta_gradient <= 1e-10) {
      VLOG(2) << "Skipping BFGS update, delta_x_dot_delta_gradient too "
              << "small: " << delta_x_dot_delta_gradient;
    } else {
      // Update dense inverse Hessian approximation.

      if (!initialized_ && use_approximate_eigenvalue_scaling_) {
        // Rescale the initial inverse Hessian approximation (H_0) to be
        // iteratively updated so that it is of similar 'size' to the true
        // inverse Hessian at the start point. As shown in [1]:
        //
        //   \gamma = (delta_gradient_{0}' * delta_x_{0}) /
        //            (delta_gradient_{0}' * delta_gradient_{0})
        //
        // satisfies:
        //
        //   (1 / \lambda_m) <= \gamma <= (1 / \lambda_1)
        //
        // where \lambda_1 & \lambda_m are the smallest and largest eigenvalues
        // of the true initial Hessian (not the inverse) respectively. Thus,
        // \gamma is an approximate eigenvalue of the true inverse Hessian, and
        // choosing H_0 = I * \gamma yields a starting point that has a
        // similar scale to the true inverse Hessian. This technique is widely
        // reported to often improve convergence; however, this is not
        // universally true, particularly if there are errors in the initial
        // gradients, or if there are significant differences in the
        // sensitivity of the problem to the parameters (i.e. the range of the
        // magnitudes of the components of the gradient is large).
        //
        // The origin of this rescaling trick is somewhat unclear; the
        // earliest reference appears to be Oren [1], but it is widely
        // discussed without specific attribution in various texts, including
        // [2] (p143).
        //
        // [1] Oren S.S., Self-scaling variable metric (SSVM) algorithms
        //     Part II: Implementation and experiments, Management Science,
        //     20(5), 863-874, 1974.
        // [2] Nocedal J., Wright S., Numerical Optimization, Springer, 1999.
        inverse_hessian_ *=
            delta_x_dot_delta_gradient / delta_gradient.dot(delta_gradient);
      }
      initialized_ = true;

      // Efficient O(num_parameters^2) BFGS update [2].
      //
      // Starting from the dense BFGS update detailed in Nocedal [2] p140/177
      // and using: y_k = delta_gradient, s_k = delta_x:
      //
      //   \rho_k = 1.0 / (s_k' * y_k)
      //   V_k = I - \rho_k * y_k * s_k'
      //   H_k = (V_k' * H_{k-1} * V_k) + (\rho_k * s_k * s_k')
      //
      // This update involves matrix-matrix products, which are naively
      // O(N^3); however, we can exploit the fact that H_k is positive
      // definite, and thus by definition symmetric, to reduce the cost of
      // the update.
      //
      // Expanding the update above yields:
      //
      //   H_k = H_{k-1} +
      //         \rho_k * ( (1.0 + \rho_k * y_k' * H_{k-1} * y_k) * s_k * s_k' -
      //                    (s_k * y_k' * H_{k-1} + H_{k-1} * y_k * s_k') )
      //
      // Using A = (s_k * y_k' * H_{k-1}), and the knowledge that
      // H_{k-1} = H_{k-1}', the last term simplifies to (A + A'). Note that
      // although A is not symmetric, (A + A') is. For ease of construction
      // we also define B = (1 + \rho_k * y_k' * H_{k-1} * y_k) * s_k * s_k',
      // which is symmetric by construction from s_k * s_k'.
      //
      // Now we can write the BFGS update as:
      //
      //   H_k = H_{k-1} + \rho_k * (B - (A + A'))

      // For efficiency, as H_k is by definition symmetric, we only maintain
      // the *lower* triangle of H_k (and of all intermediary terms).

      const double rho_k = 1.0 / delta_x_dot_delta_gradient;

      // Calculate: A = s_k * y_k' * H_{k-1}
      Matrix A = delta_x * (delta_gradient.transpose() *
                            inverse_hessian_.selfadjointView<Eigen::Lower>());

      // Calculate scalar: (1 + \rho_k * y_k' * H_{k-1} * y_k)
      const double delta_x_times_delta_x_transpose_scale_factor =
          (1.0 + (rho_k * delta_gradient.transpose() *
                  inverse_hessian_.selfadjointView<Eigen::Lower>() *
                  delta_gradient));
      // Calculate: B = (1 + \rho_k * y_k' * H_{k-1} * y_k) * s_k * s_k'
      Matrix B = Matrix::Zero(num_parameters_, num_parameters_);
      B.selfadjointView<Eigen::Lower>().
          rankUpdate(delta_x, delta_x_times_delta_x_transpose_scale_factor);

      // Finally, update the inverse Hessian approximation according to:
      // H_k = H_{k-1} + \rho_k * (B - (A + A')). Note that (A + A') is
      // symmetric, even though A is not.
      inverse_hessian_.triangularView<Eigen::Lower>() +=
          rho_k * (B - A - A.transpose());
    }

    *search_direction =
        inverse_hessian_.selfadjointView<Eigen::Lower>() *
        (-1.0 * current.gradient);

    if (search_direction->dot(current.gradient) >= 0.0) {
      LOG(WARNING) << "Numerical failure in BFGS update: inverse Hessian "
                   << "approximation is not positive definite, and thus "
                   << "initial gradient for search direction is positive: "
                   << search_direction->dot(current.gradient);
      is_positive_definite_ = false;
      return false;
    }

    return true;
  }

 private:
  const int num_parameters_;
  const bool use_approximate_eigenvalue_scaling_;
  Matrix inverse_hessian_;
  bool initialized_;
  bool is_positive_definite_;
};

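// Factory for the concrete direction types above. Callers own the returned
// object. A minimal usage sketch (a hypothetical caller, not part of this
// file; problem_size is an assumed caller-side variable):
//
//   LineSearchDirection::Options options;
//   options.type = ceres::LBFGS;
//   options.num_parameters = problem_size;
//   LineSearchDirection* direction = LineSearchDirection::Create(options);
//   // ... use direction->NextDirection(...) ...
//   delete direction;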
LineSearchDirection*
LineSearchDirection::Create(const LineSearchDirection::Options& options) {
  if (options.type == STEEPEST_DESCENT) {
    return new SteepestDescent;
  }

  if (options.type == NONLINEAR_CONJUGATE_GRADIENT) {
    return new NonlinearConjugateGradient(
        options.nonlinear_conjugate_gradient_type,
        options.function_tolerance);
  }

  if (options.type == ceres::LBFGS) {
    return new ceres::internal::LBFGS(
        options.num_parameters,
        options.max_lbfgs_rank,
        options.use_approximate_eigenvalue_bfgs_scaling);
  }

  if (options.type == ceres::BFGS) {
    return new ceres::internal::BFGS(
        options.num_parameters,
        options.use_approximate_eigenvalue_bfgs_scaling);
  }

  LOG(ERROR) << "Unknown line search direction type: " << options.type;
  return NULL;
}

}  // namespace internal
}  // namespace ceres

#endif  // CERES_NO_LINE_SEARCH_MINIMIZER