Decreasing the BFGS Hessian update tolerance to match L-BFGS.
- Improves the performance of BFGS on the NIST benchmark, as per L-BFGS.
- Adds an explanation of the origin and purpose of the secant condition
  tolerance check on the Hessian update in (L)BFGS.
Change-Id: If57b9957d31d8629c772c19a069e1e56e727b350
diff --git a/internal/ceres/line_search_direction.cc b/internal/ceres/line_search_direction.cc
index 8ded823..5d91215 100644
--- a/internal/ceres/line_search_direction.cc
+++ b/internal/ceres/line_search_direction.cc
@@ -121,6 +121,7 @@
low_rank_inverse_hessian_.Update(
previous.search_direction * previous.step_size,
current.gradient - previous.gradient);
+
search_direction->setZero();
low_rank_inverse_hessian_.RightMultiply(current.gradient.data(),
search_direction->data());
@@ -176,9 +177,45 @@
const Vector delta_gradient = current.gradient - previous.gradient;
const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);
- if (delta_x_dot_delta_gradient <= 1e-10) {
- VLOG(2) << "Skipping BFGS Update, delta_x_dot_delta_gradient too "
- << "small: " << delta_x_dot_delta_gradient;
+ // The (L)BFGS algorithm explicitly requires that the secant equation:
+ //
+ //   B_{k+1} * s_k = y_k
+ //
+ // is satisfied at each iteration, where B_{k+1} is the approximated
+ // Hessian at the (k+1)-th iteration, s_k = (x_{k+1} - x_k) and
+ // y_k = (grad_{k+1} - grad_k). As the approximated Hessian must be
+ // positive definite, this is equivalent to the condition:
+ //
+ //   s_k^T * y_k > 0    [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0]
+ //
+ // This condition would always be satisfied if the function were strictly
+ // convex; alternatively, it is always satisfied provided that a Wolfe
+ // line search is used (even if the function is not strictly convex).
+ // See [1] (p138) for a proof.
+ //
+ // Although Ceres will always use a Wolfe line search when using (L)BFGS,
+ // practical implementation considerations mean that the line search
+ // may return a point that satisfies only the Armijo condition, and thus
+ // could violate the secant equation. As such, we will only use a step
+ // to update the Hessian approximation if:
+ //
+ //   s_k^T * y_k > tolerance
+ //
+ // It is important that the tolerance is very small (and >= 0), as
+ // otherwise we might skip the update too often and fail to capture
+ // important curvature information in the Hessian. For example, going from
+ // 1e-10 -> 1e-14 improves the NIST benchmark score from 43/54 to 53/54.
+ //
+ // [1] Nocedal J., Wright S., Numerical Optimization, 2nd Ed., Springer, 2006.
+ //
+ // TODO: Consider using a damped BFGS update instead of skipping the update.
+ const double kBFGSSecantConditionHessianUpdateTolerance = 1e-14;
+ if (delta_x_dot_delta_gradient <=
+ kBFGSSecantConditionHessianUpdateTolerance) {
+ LOG(WARNING) << "Skipping BFGS Update, delta_x_dot_delta_gradient too "
+ << "small: " << delta_x_dot_delta_gradient << ", tolerance: "
+ << kBFGSSecantConditionHessianUpdateTolerance
+ << " (Secant condition).";
} else {
// Update dense inverse Hessian approximation.
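
For reference, the guard above reduces to a single dot product against the
tolerance. The standalone sketch below (not part of the patch; it uses plain
Eigen types rather than Ceres' internal Vector typedef, and the function name
SecantConditionHolds is illustrative) exercises the same check on two toy
(s_k, y_k) pairs:

    // Standalone sketch of the secant-condition guard used above.
    #include <iostream>
    #include <Eigen/Core>

    // True iff (s_k, y_k) may be used to update the Hessian approximation,
    // i.e. iff it satisfies s_k^T * y_k > tolerance.
    bool SecantConditionHolds(const Eigen::VectorXd& s,
                              const Eigen::VectorXd& y,
                              double tolerance = 1e-14) {
      return s.dot(y) > tolerance;
    }

    int main() {
      Eigen::VectorXd s(2), y(2);
      // For the strictly convex f(x) = 0.5 * ||x||^2, grad(x) = x, so
      // y_k = grad_{k+1} - grad_k = s_k and the condition always holds.
      s << 0.1, -0.2;
      y = s;
      std::cout << SecantConditionHolds(s, y) << "\n";  // 1: update used.
      // A pair where the gradient change opposes the step, as can occur on
      // a non-convex function when only the Armijo condition holds.
      y = -s;
      std::cout << SecantConditionHolds(s, y) << "\n";  // 0: update skipped.
      return 0;
    }
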
diff --git a/internal/ceres/low_rank_inverse_hessian.cc b/internal/ceres/low_rank_inverse_hessian.cc
index 0bb71dc..6a92528 100644
--- a/internal/ceres/low_rank_inverse_hessian.cc
+++ b/internal/ceres/low_rank_inverse_hessian.cc
@@ -35,6 +35,40 @@
namespace ceres {
namespace internal {
+// The (L)BFGS algorithm explicitly requires that the secant equation:
+//
+//   B_{k+1} * s_k = y_k
+//
+// is satisfied at each iteration, where B_{k+1} is the approximated
+// Hessian at the (k+1)-th iteration, s_k = (x_{k+1} - x_k) and
+// y_k = (grad_{k+1} - grad_k). As the approximated Hessian must be
+// positive definite, this is equivalent to the condition:
+//
+//   s_k^T * y_k > 0    [s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0]
+//
+// This condition would always be satisfied if the function were strictly
+// convex; alternatively, it is always satisfied provided that a Wolfe
+// line search is used (even if the function is not strictly convex).
+// See [1] (p138) for a proof.
+//
+// Although Ceres will always use a Wolfe line search when using (L)BFGS,
+// practical implementation considerations mean that the line search
+// may return a point that satisfies only the Armijo condition, and thus
+// could violate the secant equation. As such, we will only use a step
+// to update the Hessian approximation if:
+//
+//   s_k^T * y_k > tolerance
+//
+// It is important that the tolerance is very small (and >= 0), as
+// otherwise we might skip the update too often and fail to capture
+// important curvature information in the Hessian. For example, going from
+// 1e-10 -> 1e-14 improves the NIST benchmark score from 43/54 to 53/54.
+//
+// [1] Nocedal J., Wright S., Numerical Optimization, 2nd Ed., Springer, 2006.
+//
+// TODO: Consider using a damped BFGS update instead of skipping the update.
+const double kLBFGSSecantConditionHessianUpdateTolerance = 1e-14;
+
LowRankInverseHessian::LowRankInverseHessian(
int num_parameters,
int max_num_corrections,
@@ -52,11 +86,12 @@
bool LowRankInverseHessian::Update(const Vector& delta_x,
const Vector& delta_gradient) {
const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);
- // Note that 1e-14 is very small, but larger values (1e-10/12) substantially
- // weaken the performance on the NIST benchmark suite.
- if (delta_x_dot_delta_gradient <= 1e-14) {
- VLOG(2) << "Skipping LBFGS Update, delta_x_dot_delta_gradient too small: "
- << delta_x_dot_delta_gradient;
+ if (delta_x_dot_delta_gradient <=
+ kLBFGSSecantConditionHessianUpdateTolerance) {
+ LOG(WARNING) << "Skipping L-BFGS Update, delta_x_dot_delta_gradient too "
+ << "small: " << delta_x_dot_delta_gradient << ", tolerance: "
+ << kLBFGSSecantConditionHessianUpdateTolerance
+ << " (Secant condition).";
return false;
}
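
The TODOs above mention a damped BFGS update as an alternative to skipping
the update. A minimal sketch of Powell's damped update, following Procedure
18.2 of [1], is given below; it is illustrative only (not part of this
patch), it operates on the Hessian approximation B rather than the inverse
Hessian that Ceres maintains, and the function name DampedBFGSUpdate and the
Eigen types are assumptions:

    // Sketch of Powell's damped BFGS update ([1], Procedure 18.2). Instead
    // of skipping steps with small s^T * y, it replaces y with
    //   r = theta * y + (1 - theta) * B * s,
    // where theta is chosen so that s^T * r >= 0.2 * s^T * B * s > 0, and
    // then applies the standard BFGS update with r. The approximation thus
    // stays positive definite and no curvature information is discarded.
    #include <Eigen/Core>

    void DampedBFGSUpdate(const Eigen::VectorXd& s,
                          const Eigen::VectorXd& y,
                          Eigen::MatrixXd* B) {
      const Eigen::VectorXd Bs = (*B) * s;
      const double sBs = s.dot(Bs);  // > 0 while B is positive definite.
      const double sy = s.dot(y);

      // Powell's damping coefficient.
      const double theta =
          (sy >= 0.2 * sBs) ? 1.0 : (0.8 * sBs) / (sBs - sy);
      const Eigen::VectorXd r = theta * y + (1.0 - theta) * Bs;

      // Standard BFGS update of B with r in place of y:
      //   B <- B - (B s)(B s)^T / (s^T B s) + r r^T / (s^T r).
      *B += -(Bs * Bs.transpose()) / sBs + (r * r.transpose()) / s.dot(r);
    }

Because s^T * r is bounded below by 0.2 * s^T * B * s by construction, the
final division is always safe, even for steps where s^T * y <= 0.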