Adding Wolfe line search algorithm and full BFGS search direction options.
Change-Id: I9d3fb117805bdfa5bc33613368f45ae8f10e0d79
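
For reference, a minimal usage sketch (not part of this change) showing how the
new options can be selected; the toy one-parameter cost is illustrative only:

  #include "ceres/ceres.h"

  // Toy objective: residual r(x) = 10 - x, so f(x) = 0.5 * (10 - x)^2.
  struct QuadraticCost {
    template <typename T>
    bool operator()(const T* const x, T* residual) const {
      residual[0] = T(10.0) - x[0];
      return true;
    }
  };

  int main() {
    double x = 0.5;
    ceres::Problem problem;
    problem.AddResidualBlock(
        new ceres::AutoDiffCostFunction<QuadraticCost, 1, 1>(new QuadraticCost),
        NULL, &x);

    ceres::Solver::Options options;
    options.minimizer_type = ceres::LINE_SEARCH;
    options.line_search_direction_type = ceres::BFGS;  // New: full dense BFGS.
    options.line_search_type = ceres::WOLFE;           // New: strong Wolfe search.

    ceres::Solver::Summary summary;
    ceres::Solve(options, &problem, &summary);
    return 0;
  }
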
diff --git a/include/ceres/iteration_callback.h b/include/ceres/iteration_callback.h
index 13140c2..987c2d9 100644
--- a/include/ceres/iteration_callback.h
+++ b/include/ceres/iteration_callback.h
@@ -54,6 +54,8 @@
eta(0.0),
step_size(0.0),
line_search_function_evaluations(0),
+ line_search_gradient_evaluations(0),
+ line_search_iterations(0),
linear_solver_iterations(0),
iteration_time_in_seconds(0.0),
step_solver_time_in_seconds(0.0),
@@ -121,9 +123,15 @@
// Step size computed by the line search algorithm.
double step_size;
- // Number of function evaluations used by the line search algorithm.
+ // Number of function value evaluations used by the line search algorithm.
int line_search_function_evaluations;
+ // Number of function gradient evaluations used by the line search algorithm.
+ int line_search_gradient_evaluations;
+
+ // Number of iterations taken by the line search algorithm.
+ int line_search_iterations;
+
// Number of iterations taken by the linear solver to solve for the
// Newton step.
int linear_solver_iterations;
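
A hedged sketch of how the new counters might be consumed from an
IterationCallback; the callback class name is illustrative and not part of this
change:

  #include <cstdio>
  #include "ceres/iteration_callback.h"

  // Logs the per-iteration line search effort reported in IterationSummary.
  class LineSearchLoggingCallback : public ceres::IterationCallback {
   public:
    virtual ceres::CallbackReturnType operator()(
        const ceres::IterationSummary& summary) {
      std::printf("iter: %d  f-evals: %d  g-evals: %d  ls-iters: %d\n",
                  summary.iteration,
                  summary.line_search_function_evaluations,
                  summary.line_search_gradient_evaluations,
                  summary.line_search_iterations);
      return ceres::SOLVER_CONTINUE;
    }
  };

  // Installed before solving via: options.callbacks.push_back(&callback);
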
diff --git a/include/ceres/solver.h b/include/ceres/solver.h
index 47c7e94..e7b8d09 100644
--- a/include/ceres/solver.h
+++ b/include/ceres/solver.h
@@ -60,14 +60,19 @@
Options() {
minimizer_type = TRUST_REGION;
line_search_direction_type = LBFGS;
- line_search_type = ARMIJO;
+ line_search_type = WOLFE;
nonlinear_conjugate_gradient_type = FLETCHER_REEVES;
max_lbfgs_rank = 20;
+ use_approximate_eigenvalue_bfgs_scaling = false;
line_search_interpolation_type = CUBIC;
min_line_search_step_size = 1e-9;
- armijo_sufficient_decrease = 1e-4;
- min_armijo_relative_step_size_change = 1e-3;
- max_armijo_relative_step_size_change = 0.6;
+ line_search_sufficient_function_decrease = 1e-4;
+ max_line_search_step_contraction = 1e-3;
+ min_line_search_step_contraction = 0.6;
+ max_num_line_search_step_size_iterations = 20;
+ max_num_line_search_direction_restarts = 5;
+ line_search_sufficient_curvature_decrease = 0.9;
+ max_line_search_step_expansion = 10.0;
trust_region_strategy_type = LEVENBERG_MARQUARDT;
dogleg_type = TRADITIONAL_DOGLEG;
@@ -178,6 +183,28 @@
// Limited Storage". Mathematics of Computation 35 (151): 773–782.
int max_lbfgs_rank;
+ // As part of the (L)BFGS update step (BFGS) / right-multiply step (L-BFGS),
+ // the initial inverse Hessian approximation is taken to be the Identity.
+ // However, Oren showed that using I * \gamma instead, where \gamma is
+ // chosen to approximate an eigenvalue of the true inverse Hessian, can
+ // result in improved convergence in a wide variety of cases. Setting
+ // use_approximate_eigenvalue_bfgs_scaling to true enables this scaling.
+ //
+ // It is important to note that approximate eigenvalue scaling does not
+ // always improve convergence, and that it can in fact significantly degrade
+ // performance for certain classes of problem, which is why it is disabled
+ // by default. In particular it can degrade performance when the
+ // sensitivity of the problem to different parameters varies significantly,
+ // as in this case a single scalar factor fails to capture this variation
+ // and detrimentally downscales parts of the Jacobian approximation which
+ // correspond to low-sensitivity parameters. It can also reduce the
+ // robustness of the solution to errors in the Jacobians.
+ //
+ // Oren S.S., Self-scaling variable metric (SSVM) algorithms
+ // Part II: Implementation and experiments, Management Science,
+ // 20(5), 863-874, 1974.
+ bool use_approximate_eigenvalue_bfgs_scaling;
+
// Degree of the polynomial used to approximate the objective
// function. Valid values are BISECTION, QUADRATIC and CUBIC.
//
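
For illustration, a small configuration sketch (an assumption, not part of this
patch) of opting into the new scaling flag for an L-BFGS run:

  ceres::Solver::Options options;
  options.minimizer_type = ceres::LINE_SEARCH;
  options.line_search_direction_type = ceres::LBFGS;
  options.max_lbfgs_rank = 20;
  // Enable Oren's approximate eigenvalue scaling of the initial inverse
  // Hessian approximation; disabled by default.
  options.use_approximate_eigenvalue_bfgs_scaling = true;
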
@@ -189,7 +216,7 @@
// value, it is truncated to zero.
double min_line_search_step_size;
- // Armijo line search parameters.
+ // Line search parameters.
// Solving the line search problem exactly is computationally
// prohibitive. Fortunately, line search based optimization
@@ -201,19 +228,63 @@
//
// f(step_size) <= f(0) + sufficient_decrease * f'(0) * step_size
//
- double armijo_sufficient_decrease;
+ double line_search_sufficient_function_decrease;
- // In each iteration of the Armijo line search,
+ // In each iteration of the line search,
//
- // new_step_size >= min_relative_step_size_change * step_size
+ // new_step_size >= max_line_search_step_contraction * step_size
//
- double min_armijo_relative_step_size_change;
+ // Note that by definition, for contraction:
+ //
+ // 0 < max_step_contraction < min_step_contraction < 1
+ //
+ double max_line_search_step_contraction;
- // In each iteration of the Armijo line search,
+ // In each iteration of the line search,
//
- // new_step_size <= max_relative_step_size_change * step_size
+ // new_step_size <= min_line_search_step_contraction * step_size
//
- double max_armijo_relative_step_size_change;
+ // Note that by definition, for contraction:
+ //
+ // 0 < max_step_contraction < min_step_contraction < 1
+ //
+ double min_line_search_step_contraction;
+
+ // Maximum number of trial step size iterations during each line search.
+ // If a step size satisfying the search conditions cannot be found within
+ // this number of trials, the line search will terminate.
+ int max_num_line_search_step_size_iterations;
+
+ // Maximum number of restarts of the line search direction algorithm before
+ // terminating the optimization. Restarts of the line search direction
+ // algorithm occur when the current algorithm fails to produce a new descent
+ // direction. This typically indicates a numerical failure, or a breakdown
+ // in the validity of the approximations used.
+ int max_num_line_search_direction_restarts;
+
+ // The strong Wolfe conditions consist of the Armijo sufficient
+ // decrease condition, and an additional requirement that the
+ // step-size be chosen s.t. the _magnitude_ ('strong' Wolfe
+ // conditions) of the gradient along the search direction
+ // decreases sufficiently. Precisely, this second condition
+ // is that we seek a step_size s.t.
+ //
+ // |f'(step_size)| <= sufficient_curvature_decrease * |f'(0)|
+ //
+ // Where f() is the line search objective and f'() is the derivative
+ // of f w.r.t step_size (d f / d step_size).
+ double line_search_sufficient_curvature_decrease;
+
+ // During the bracketing phase of the Wolfe search, the step size is
+ // increased until either a point satisfying the Wolfe conditions is
+ // found, or an upper bound for a bracket containing a point satisfying
+ // the conditions is found. Precisely, at each iteration of the
+ // expansion:
+ //
+ // new_step_size <= max_step_expansion * step_size.
+ //
+ // By definition for expansion, max_step_expansion > 1.0.
+ double max_line_search_step_expansion;
TrustRegionStrategyType trust_region_strategy_type;
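
To make the interplay of the new line search parameters concrete, a
configuration sketch; all values are the defaults set in Options() above except
the step size iteration cap, which is illustrative:

  ceres::Solver::Options options;
  options.line_search_type = ceres::WOLFE;
  // Armijo sufficient decrease and strong Wolfe curvature conditions.
  options.line_search_sufficient_function_decrease = 1e-4;
  options.line_search_sufficient_curvature_decrease = 0.9;
  // Per-iteration contraction bounds (0 < max < min < 1) and the
  // bracketing-phase expansion factor (> 1).
  options.max_line_search_step_contraction = 1e-3;
  options.min_line_search_step_contraction = 0.6;
  options.max_line_search_step_expansion = 10.0;
  options.max_num_line_search_step_size_iterations = 30;  // Illustrative cap.
  options.max_num_line_search_direction_restarts = 5;
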
diff --git a/include/ceres/types.h b/include/ceres/types.h
index 5becb53..46c2fd2 100644
--- a/include/ceres/types.h
+++ b/include/ceres/types.h
@@ -37,6 +37,8 @@
#ifndef CERES_PUBLIC_TYPES_H_
#define CERES_PUBLIC_TYPES_H_
+#include <string>
+
#include "ceres/internal/port.h"
namespace ceres {
@@ -167,10 +169,47 @@
// used is determined by NonlinearConjugateGradientType.
NONLINEAR_CONJUGATE_GRADIENT,
- // A limited memory approximation to the inverse Hessian is
- // maintained and used to compute a quasi-Newton step.
+ // BFGS, and its limited memory approximation L-BFGS, are quasi-Newton
+ // algorithms that approximate the Hessian matrix by iteratively refining
+ // an initial estimate with low-rank updates using the gradient at each
+ // iteration. They are a generalisation of the Secant method and satisfy
+ // the Secant equation. The Secant equation has infinitely many solutions
+ // in multiple dimensions, as there are N*(N+1)/2 degrees of freedom in a
+ // symmetric matrix but only N conditions are specified by the Secant
+ // equation. The requirement that the Hessian approximation be positive
+ // definite imposes N additional constraints, but that still leaves
+ // remaining degrees of freedom. (L)BFGS methods uniquely determine the
+ // approximate Hessian by imposing the additional constraint that the
+ // approximation at the next iteration must be the 'closest' to the current
+ // approximation (the nature of how this proximity is measured is actually
+ // the defining difference between a family of quasi-Newton methods including
+ // (L)BFGS & DFP). (L)BFGS is currently regarded as the best known
+ // general quasi-Newton method.
//
- // For more details see
+ // The principal difference between BFGS and L-BFGS is that whilst BFGS
+ // maintains a full, dense approximation to the (inverse) Hessian, L-BFGS
+ // maintains only a window of the last M observations of the parameters and
+ // gradients. Using this observation history, the calculation of the next
+ // search direction can be computed without requiring the construction of the
+ // full dense inverse Hessian approximation. This is particularly important
+ // for problems with a large number of parameters, where storage of an N-by-N
+ // matrix in memory would be prohibitive.
+ //
+ // For more details on BFGS see:
+ //
+ // Broyden, C.G., "The Convergence of a Class of Double-rank Minimization
+ // Algorithms,"; J. Inst. Maths. Applics., Vol. 6, pp 76–90, 1970.
+ //
+ // Fletcher, R., "A New Approach to Variable Metric Algorithms,"
+ // Computer Journal, Vol. 13, pp 317–322, 1970.
+ //
+ // Goldfarb, D., "A Family of Variable Metric Updates Derived by Variational
+ // Means," Mathematics of Computing, Vol. 24, pp 23–26, 1970.
+ //
+ // Shanno, D.F., "Conditioning of Quasi-Newton Methods for Function
+ // Minimization," Mathematics of Computing, Vol. 24, pp 647–656, 1970.
+ //
+ // For more details on L-BFGS see:
//
// Nocedal, J. (1980). "Updating Quasi-Newton Matrices with Limited
// Storage". Mathematics of Computation 35 (151): 773–782.
@@ -179,7 +218,12 @@
// "Representations of Quasi-Newton Matrices and their use in
// Limited Memory Methods". Mathematical Programming 63 (4):
// 129–156.
+ //
+ // A general reference for both methods:
+ //
+ // Nocedal J., Wright S., Numerical Optimization, 2nd Ed. Springer, 1999.
LBFGS,
+ BFGS,
};
// Nonlinear conjugate gradient methods are a generalization of the
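
Following the BFGS / L-BFGS discussion above, a hedged selection sketch; the
size threshold is purely illustrative and 'problem' is assumed to already exist:

  ceres::Solver::Options options;
  const int num_parameters = problem.NumParameters();
  // Dense BFGS maintains a full N-by-N inverse Hessian approximation, so
  // prefer the limited memory variant once the parameter count grows large.
  const int kDenseBFGSSizeLimit = 1000;  // Hypothetical threshold.
  if (num_parameters <= kDenseBFGSSizeLimit) {
    options.line_search_direction_type = ceres::BFGS;
  } else {
    options.line_search_direction_type = ceres::LBFGS;
    options.max_lbfgs_rank = 20;  // Window of past parameter/gradient pairs.
  }
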
@@ -198,6 +242,7 @@
// Backtracking line search with polynomial interpolation or
// bisection.
ARMIJO,
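+
+ // Bracketing line search algorithm that seeks a step size satisfying the
+ // strong Wolfe conditions (sufficient decrease and curvature conditions).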
+ WOLFE,
};
// Ceres supports different strategies for computing the trust region