Decrease the Hessian update threshold for BFGS to match L-BFGS. - Improves the performance of BFGS on the NIST benchmark suite, matching L-BFGS. - Add an explanation of the origin and purpose of the secant-condition tolerance check for the Hessian update in (L)BFGS. Change-Id: If57b9957d31d8629c772c19a069e1e56e727b350
diff --git a/internal/ceres/line_search_direction.cc b/internal/ceres/line_search_direction.cc index 8ded823..5d91215 100644 --- a/internal/ceres/line_search_direction.cc +++ b/internal/ceres/line_search_direction.cc
@@ -121,6 +121,7 @@ low_rank_inverse_hessian_.Update( previous.search_direction * previous.step_size, current.gradient - previous.gradient); + search_direction->setZero(); low_rank_inverse_hessian_.RightMultiply(current.gradient.data(), search_direction->data()); @@ -176,9 +177,45 @@ const Vector delta_gradient = current.gradient - previous.gradient; const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient); - if (delta_x_dot_delta_gradient <= 1e-10) { - VLOG(2) << "Skipping BFGS Update, delta_x_dot_delta_gradient too " - << "small: " << delta_x_dot_delta_gradient; + // The (L)BFGS algorithm explicitly requires that the secant equation: + // + // B_{k+1} * s_k = y_k + // + // is satisfied at each iteration, where B_{k+1} is the approximated + // Hessian at the k+1-th iteration, s_k = (x_{k+1} - x_{k}) and + // y_k = (grad_{k+1} - grad_{k}). As the approximated Hessian must be + // positive definite, this is equivalent to the condition: + // + // s_k^T * y_k > 0 [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0] + // + // This condition would always be satisfied if the function were strictly + // convex; alternatively, it is always satisfied provided that a Wolfe line + // search is used (even if the function is not strictly convex). See [1] + // (p138) for a proof. + // + // Although Ceres will always use a Wolfe line search when using (L)BFGS, + // practical implementation considerations mean that the line search + // may return a point that satisfies only the Armijo condition, and thus + // could violate the curvature condition above. As such, we will only use a step + // to update the Hessian approximation if: + // + // s_k^T * y_k > tolerance + // + // It is important that tolerance is very small (and >=0), as otherwise we + // might skip the update too often and fail to capture important curvature + // information in the Hessian. For example, going from 1e-10 -> 1e-14 + // improves the NIST benchmark score from 43/54 to 53/54. 
+ // + // [1] Nocedal J., Wright S., Numerical Optimization, 2nd Ed. Springer, 1999. + // + // TODO: Consider using Damped BFGS update instead of skipping update. + const double kBFGSSecantConditionHessianUpdateTolerance = 1e-14; + if (delta_x_dot_delta_gradient <= + kBFGSSecantConditionHessianUpdateTolerance) { + LOG(WARNING) << "Skipping BFGS Update, delta_x_dot_delta_gradient too " + << "small: " << delta_x_dot_delta_gradient << ", tolerance: " + << kBFGSSecantConditionHessianUpdateTolerance + << " (Secant condition)."; } else { // Update dense inverse Hessian approximation.
diff --git a/internal/ceres/low_rank_inverse_hessian.cc b/internal/ceres/low_rank_inverse_hessian.cc index 0bb71dc..6a92528 100644 --- a/internal/ceres/low_rank_inverse_hessian.cc +++ b/internal/ceres/low_rank_inverse_hessian.cc
@@ -35,6 +35,40 @@ namespace ceres { namespace internal { +// The (L)BFGS algorithm explicitly requires that the secant equation: +// +// B_{k+1} * s_k = y_k +// +// is satisfied at each iteration, where B_{k+1} is the approximated +// Hessian at the k+1-th iteration, s_k = (x_{k+1} - x_{k}) and +// y_k = (grad_{k+1} - grad_{k}). As the approximated Hessian must be +// positive definite, this is equivalent to the condition: +// +// s_k^T * y_k > 0 [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0] +// +// This condition would always be satisfied if the function were strictly +// convex; alternatively, it is always satisfied provided that a Wolfe line +// search is used (even if the function is not strictly convex). See [1] +// (p138) for a proof. +// +// Although Ceres will always use a Wolfe line search when using (L)BFGS, +// practical implementation considerations mean that the line search +// may return a point that satisfies only the Armijo condition, and thus +// could violate the curvature condition above. As such, we will only use a step +// to update the Hessian approximation if: +// +// s_k^T * y_k > tolerance +// +// It is important that tolerance is very small (and >=0), as otherwise we +// might skip the update too often and fail to capture important curvature +// information in the Hessian. For example, going from 1e-10 -> 1e-14 improves +// the NIST benchmark score from 43/54 to 53/54. +// +// [1] Nocedal J., Wright S., Numerical Optimization, 2nd Ed. Springer, 1999. +// +// TODO: Consider using Damped BFGS update instead of skipping update. 
+const double kLBFGSSecantConditionHessianUpdateTolerance = 1e-14; + LowRankInverseHessian::LowRankInverseHessian( int num_parameters, int max_num_corrections, @@ -52,11 +86,12 @@ bool LowRankInverseHessian::Update(const Vector& delta_x, const Vector& delta_gradient) { const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient); - // Note that 1e-14 is very small, but larger values (1e-10/12) substantially - // weaken the performance on the NIST benchmark suite. - if (delta_x_dot_delta_gradient <= 1e-14) { - VLOG(2) << "Skipping LBFGS Update, delta_x_dot_delta_gradient too small: " - << delta_x_dot_delta_gradient; + if (delta_x_dot_delta_gradient <= + kLBFGSSecantConditionHessianUpdateTolerance) { + LOG(WARNING) << "Skipping L-BFGS Update, delta_x_dot_delta_gradient too " + << "small: " << delta_x_dot_delta_gradient << ", tolerance: " + << kLBFGSSecantConditionHessianUpdateTolerance + << " (Secant condition)."; return false; }