| // Ceres Solver - A fast non-linear least squares minimizer |
| // Copyright 2015 Google Inc. All rights reserved. |
| // http://ceres-solver.org/ |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are met: |
| // |
| // * Redistributions of source code must retain the above copyright notice, |
| // this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above copyright notice, |
| // this list of conditions and the following disclaimer in the documentation |
| // and/or other materials provided with the distribution. |
| // * Neither the name of Google Inc. nor the names of its contributors may be |
| // used to endorse or promote products derived from this software without |
| // specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| // POSSIBILITY OF SUCH DAMAGE. |
| // |
| // Author: sameeragarwal@google.com (Sameer Agarwal) |
| |
| #include "ceres/line_search_direction.h" |
| |
| #include <memory> |
| |
| #include "ceres/internal/eigen.h" |
| #include "ceres/internal/export.h" |
| #include "ceres/line_search_minimizer.h" |
| #include "ceres/low_rank_inverse_hessian.h" |
| #include "glog/logging.h" |
| |
| namespace ceres::internal { |
| |
| class CERES_NO_EXPORT SteepestDescent final : public LineSearchDirection { |
| public: |
| bool NextDirection(const LineSearchMinimizer::State& previous, |
| const LineSearchMinimizer::State& current, |
| Vector* search_direction) override { |
| *search_direction = -current.gradient; |
| return true; |
| } |
| }; |
| |
| class CERES_NO_EXPORT NonlinearConjugateGradient final |
| : public LineSearchDirection { |
| public: |
| NonlinearConjugateGradient(const NonlinearConjugateGradientType type, |
| const double function_tolerance) |
| : type_(type), function_tolerance_(function_tolerance) {} |
| |
| bool NextDirection(const LineSearchMinimizer::State& previous, |
| const LineSearchMinimizer::State& current, |
| Vector* search_direction) override { |
| double beta = 0.0; |
| Vector gradient_change; |
| switch (type_) { |
| case FLETCHER_REEVES: |
| beta = current.gradient_squared_norm / previous.gradient_squared_norm; |
| break; |
| case POLAK_RIBIERE: |
| gradient_change = current.gradient - previous.gradient; |
| beta = (current.gradient.dot(gradient_change) / |
| previous.gradient_squared_norm); |
| break; |
| case HESTENES_STIEFEL: |
| gradient_change = current.gradient - previous.gradient; |
| beta = (current.gradient.dot(gradient_change) / |
| previous.search_direction.dot(gradient_change)); |
| break; |
| default: |
| LOG(FATAL) << "Unknown nonlinear conjugate gradient type: " << type_; |
| } |
| |
| *search_direction = -current.gradient + beta * previous.search_direction; |
| const double directional_derivative = |
| current.gradient.dot(*search_direction); |
| if (directional_derivative > -function_tolerance_) { |
| LOG(WARNING) << "Restarting non-linear conjugate gradients: " |
| << directional_derivative; |
| *search_direction = -current.gradient; |
| } |
| |
| return true; |
| } |
| |
| private: |
| const NonlinearConjugateGradientType type_; |
| const double function_tolerance_; |
| }; |
| |
| class CERES_NO_EXPORT LBFGS final : public LineSearchDirection { |
| public: |
| LBFGS(const int num_parameters, |
| const int max_lbfgs_rank, |
| const bool use_approximate_eigenvalue_bfgs_scaling) |
| : low_rank_inverse_hessian_(num_parameters, |
| max_lbfgs_rank, |
| use_approximate_eigenvalue_bfgs_scaling), |
| is_positive_definite_(true) {} |
| |
| bool NextDirection(const LineSearchMinimizer::State& previous, |
| const LineSearchMinimizer::State& current, |
| Vector* search_direction) override { |
| CHECK(is_positive_definite_) |
| << "Ceres bug: NextDirection() called on L-BFGS after inverse Hessian " |
| << "approximation has become indefinite, please contact the " |
| << "developers!"; |
| |
| low_rank_inverse_hessian_.Update( |
| previous.search_direction * previous.step_size, |
| current.gradient - previous.gradient); |
| |
| search_direction->setZero(); |
| low_rank_inverse_hessian_.RightMultiplyAndAccumulate( |
| current.gradient.data(), search_direction->data()); |
| *search_direction *= -1.0; |
| |
| if (search_direction->dot(current.gradient) >= 0.0) { |
| LOG(WARNING) << "Numerical failure in L-BFGS update: inverse Hessian " |
| << "approximation is not positive definite, and thus " |
| << "initial gradient for search direction is positive: " |
| << search_direction->dot(current.gradient); |
| is_positive_definite_ = false; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| private: |
| LowRankInverseHessian low_rank_inverse_hessian_; |
| bool is_positive_definite_; |
| }; |
| |
| class CERES_NO_EXPORT BFGS final : public LineSearchDirection { |
| public: |
| BFGS(const int num_parameters, const bool use_approximate_eigenvalue_scaling) |
| : num_parameters_(num_parameters), |
| use_approximate_eigenvalue_scaling_(use_approximate_eigenvalue_scaling), |
| initialized_(false), |
| is_positive_definite_(true) { |
| if (num_parameters_ >= 1000) { |
| LOG(WARNING) << "BFGS line search being created with: " << num_parameters_ |
| << " parameters, this will allocate a dense approximate " |
| << "inverse Hessian of size: " << num_parameters_ << " x " |
| << num_parameters_ |
| << ", consider using the L-BFGS memory-efficient line " |
| << "search direction instead."; |
| } |
| // Construct inverse_hessian_ after logging warning about size s.t. if the |
| // allocation crashes us, the log will highlight what the issue likely was. |
| inverse_hessian_ = Matrix::Identity(num_parameters, num_parameters); |
| } |
| |
| bool NextDirection(const LineSearchMinimizer::State& previous, |
| const LineSearchMinimizer::State& current, |
| Vector* search_direction) override { |
| CHECK(is_positive_definite_) |
| << "Ceres bug: NextDirection() called on BFGS after inverse Hessian " |
| << "approximation has become indefinite, please contact the " |
| << "developers!"; |
| |
| const Vector delta_x = previous.search_direction * previous.step_size; |
| const Vector delta_gradient = current.gradient - previous.gradient; |
| const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient); |
| |
| // The (L)BFGS algorithm explicitly requires that the secant equation: |
| // |
| // B_{k+1} * s_k = y_k |
| // |
| // Is satisfied at each iteration, where B_{k+1} is the approximated |
| // Hessian at the k+1-th iteration, s_k = (x_{k+1} - x_{k}) and |
| // y_k = (grad_{k+1} - grad_{k}). As the approximated Hessian must be |
| // positive definite, this is equivalent to the condition: |
| // |
| // s_k^T * y_k > 0 [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0] |
| // |
| // This condition would always be satisfied if the function was strictly |
| // convex, alternatively, it is always satisfied provided that a Wolfe line |
| // search is used (even if the function is not strictly convex). See [1] |
| // (p138) for a proof. |
| // |
| // Although Ceres will always use a Wolfe line search when using (L)BFGS, |
| // practical implementation considerations mean that the line search |
| // may return a point that satisfies only the Armijo condition, and thus |
| // could violate the Secant equation. As such, we will only use a step |
| // to update the Hessian approximation if: |
| // |
| // s_k^T * y_k > tolerance |
| // |
| // It is important that tolerance is very small (and >=0), as otherwise we |
| // might skip the update too often and fail to capture important curvature |
| // information in the Hessian. For example going from 1e-10 -> 1e-14 |
| // improves the NIST benchmark score from 43/54 to 53/54. |
| // |
| // [1] Nocedal J, Wright S, Numerical Optimization, 2nd Ed. Springer, 1999. |
| // |
| // TODO(alexs.mac): Consider using Damped BFGS update instead of |
| // skipping update. |
| const double kBFGSSecantConditionHessianUpdateTolerance = 1e-14; |
| if (delta_x_dot_delta_gradient <= |
| kBFGSSecantConditionHessianUpdateTolerance) { |
| VLOG(2) << "Skipping BFGS Update, delta_x_dot_delta_gradient too " |
| << "small: " << delta_x_dot_delta_gradient |
| << ", tolerance: " << kBFGSSecantConditionHessianUpdateTolerance |
| << " (Secant condition)."; |
| } else { |
| // Update dense inverse Hessian approximation. |
| |
| if (!initialized_ && use_approximate_eigenvalue_scaling_) { |
| // Rescale the initial inverse Hessian approximation (H_0) to be |
| // iteratively updated so that it is of similar 'size' to the true |
| // inverse Hessian at the start point. As shown in [1]: |
| // |
| // \gamma = (delta_gradient_{0}' * delta_x_{0}) / |
| // (delta_gradient_{0}' * delta_gradient_{0}) |
| // |
| // Satisfies: |
| // |
| // (1 / \lambda_m) <= \gamma <= (1 / \lambda_1) |
| // |
| // Where \lambda_1 & \lambda_m are the smallest and largest eigenvalues |
| // of the true initial Hessian (not the inverse) respectively. Thus, |
| // \gamma is an approximate eigenvalue of the true inverse Hessian, and |
| // choosing: H_0 = I * \gamma will yield a starting point that has a |
| // similar scale to the true inverse Hessian. This technique is widely |
| // reported to often improve convergence, however this is not |
| // universally true, particularly if there are errors in the initial |
| // gradients, or if there are significant differences in the sensitivity |
| // of the problem to the parameters (i.e. the range of the magnitudes of |
| // the components of the gradient is large). |
| // |
| // The original origin of this rescaling trick is somewhat unclear, the |
| // earliest reference appears to be Oren [1], however it is widely |
| // discussed without specific attribution in various texts including |
| // [2] (p143). |
| // |
| // [1] Oren S.S., Self-scaling variable metric (SSVM) algorithms |
| // Part II: Implementation and experiments, Management Science, |
| // 20(5), 863-874, 1974. |
| // [2] Nocedal J., Wright S., Numerical Optimization, Springer, 1999. |
| const double approximate_eigenvalue_scale = |
| delta_x_dot_delta_gradient / delta_gradient.dot(delta_gradient); |
| inverse_hessian_ *= approximate_eigenvalue_scale; |
| |
| VLOG(4) << "Applying approximate_eigenvalue_scale: " |
| << approximate_eigenvalue_scale << " to initial inverse " |
| << "Hessian approximation."; |
| } |
| initialized_ = true; |
| |
| // Efficient O(num_parameters^2) BFGS update [2]. |
| // |
| // Starting from dense BFGS update detailed in Nocedal [2] p140/177 and |
| // using: y_k = delta_gradient, s_k = delta_x: |
| // |
| // \rho_k = 1.0 / (s_k' * y_k) |
| // V_k = I - \rho_k * y_k * s_k' |
| // H_k = (V_k' * H_{k-1} * V_k) + (\rho_k * s_k * s_k') |
| // |
| // This update involves matrix, matrix products which naively O(N^3), |
| // however we can exploit our knowledge that H_k is positive definite |
| // and thus by defn. symmetric to reduce the cost of the update: |
| // |
| // Expanding the update above yields: |
| // |
| // H_k = H_{k-1} + |
| // \rho_k * ( (1.0 + \rho_k * y_k' * H_k * y_k) * s_k * s_k' - |
| // (s_k * y_k' * H_k + H_k * y_k * s_k') ) |
| // |
| // Using: A = (s_k * y_k' * H_k), and the knowledge that H_k = H_k', the |
| // last term simplifies to (A + A'). Note that although A is not symmetric |
| // (A + A') is symmetric. For ease of construction we also define |
| // B = (1 + \rho_k * y_k' * H_k * y_k) * s_k * s_k', which is by defn |
| // symmetric due to construction from: s_k * s_k'. |
| // |
| // Now we can write the BFGS update as: |
| // |
| // H_k = H_{k-1} + \rho_k * (B - (A + A')) |
| |
| // For efficiency, as H_k is by defn. symmetric, we will only maintain the |
| // *lower* triangle of H_k (and all intermediary terms). |
| |
| const double rho_k = 1.0 / delta_x_dot_delta_gradient; |
| |
| // Calculate: A = s_k * y_k' * H_k |
| Matrix A = delta_x * (delta_gradient.transpose() * |
| inverse_hessian_.selfadjointView<Eigen::Lower>()); |
| |
| // Calculate scalar: (1 + \rho_k * y_k' * H_k * y_k) |
| const double delta_x_times_delta_x_transpose_scale_factor = |
| (1.0 + |
| (rho_k * delta_gradient.transpose() * |
| inverse_hessian_.selfadjointView<Eigen::Lower>() * delta_gradient)); |
| // Calculate: B = (1 + \rho_k * y_k' * H_k * y_k) * s_k * s_k' |
| Matrix B = Matrix::Zero(num_parameters_, num_parameters_); |
| B.selfadjointView<Eigen::Lower>().rankUpdate( |
| delta_x, delta_x_times_delta_x_transpose_scale_factor); |
| |
| // Finally, update inverse Hessian approximation according to: |
| // H_k = H_{k-1} + \rho_k * (B - (A + A')). Note that (A + A') is |
| // symmetric, even though A is not. |
| inverse_hessian_.triangularView<Eigen::Lower>() += |
| rho_k * (B - A - A.transpose()); |
| } |
| |
| *search_direction = inverse_hessian_.selfadjointView<Eigen::Lower>() * |
| (-1.0 * current.gradient); |
| |
| if (search_direction->dot(current.gradient) >= 0.0) { |
| LOG(WARNING) << "Numerical failure in BFGS update: inverse Hessian " |
| << "approximation is not positive definite, and thus " |
| << "initial gradient for search direction is positive: " |
| << search_direction->dot(current.gradient); |
| is_positive_definite_ = false; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| private: |
| const int num_parameters_; |
| const bool use_approximate_eigenvalue_scaling_; |
| Matrix inverse_hessian_; |
| bool initialized_; |
| bool is_positive_definite_; |
| }; |
| |
| LineSearchDirection::~LineSearchDirection() = default; |
| |
| std::unique_ptr<LineSearchDirection> LineSearchDirection::Create( |
| const LineSearchDirection::Options& options) { |
| if (options.type == STEEPEST_DESCENT) { |
| return std::make_unique<SteepestDescent>(); |
| } |
| |
| if (options.type == NONLINEAR_CONJUGATE_GRADIENT) { |
| return std::make_unique<NonlinearConjugateGradient>( |
| options.nonlinear_conjugate_gradient_type, options.function_tolerance); |
| } |
| |
| if (options.type == ceres::LBFGS) { |
| return std::make_unique<ceres::internal::LBFGS>( |
| options.num_parameters, |
| options.max_lbfgs_rank, |
| options.use_approximate_eigenvalue_bfgs_scaling); |
| } |
| |
| if (options.type == ceres::BFGS) { |
| return std::make_unique<ceres::internal::BFGS>( |
| options.num_parameters, |
| options.use_approximate_eigenvalue_bfgs_scaling); |
| } |
| |
| LOG(ERROR) << "Unknown line search direction type: " << options.type; |
| return nullptr; |
| } |
| |
| } // namespace ceres::internal |