Adding Wolfe line search algorithm and full BFGS search direction options.
Change-Id: I9d3fb117805bdfa5bc33613368f45ae8f10e0d79
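
For reference, a minimal usage sketch (not part of this change) showing how the
new options can be selected; the toy one-parameter cost is illustrative only:

  #include "ceres/ceres.h"

  // Toy objective: residual r(x) = 10 - x, so f(x) = 0.5 * (10 - x)^2.
  struct QuadraticCost {
    template <typename T>
    bool operator()(const T* const x, T* residual) const {
      residual[0] = T(10.0) - x[0];
      return true;
    }
  };

  int main() {
    double x = 0.5;
    ceres::Problem problem;
    problem.AddResidualBlock(
        new ceres::AutoDiffCostFunction<QuadraticCost, 1, 1>(new QuadraticCost),
        NULL, &x);

    ceres::Solver::Options options;
    options.minimizer_type = ceres::LINE_SEARCH;
    options.line_search_direction_type = ceres::BFGS;  // New: full dense BFGS.
    options.line_search_type = ceres::WOLFE;           // New: strong Wolfe search.

    ceres::Solver::Summary summary;
    ceres::Solve(options, &problem, &summary);
    return 0;
  }
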
diff --git a/include/ceres/iteration_callback.h b/include/ceres/iteration_callback.h
index 13140c2..987c2d9 100644
--- a/include/ceres/iteration_callback.h
+++ b/include/ceres/iteration_callback.h
@@ -54,6 +54,8 @@
eta(0.0),
step_size(0.0),
line_search_function_evaluations(0),
+ line_search_gradient_evaluations(0),
+ line_search_iterations(0),
linear_solver_iterations(0),
iteration_time_in_seconds(0.0),
step_solver_time_in_seconds(0.0),
@@ -121,9 +123,15 @@
// Step size computed by the line search algorithm.
double step_size;
- // Number of function evaluations used by the line search algorithm.
+ // Number of function value evaluations used by the line search algorithm.
int line_search_function_evaluations;
+ // Number of function gradient evaluations used by the line search algorithm.
+ int line_search_gradient_evaluations;
+
+ // Number of iterations taken by the line search algorithm.
+ int line_search_iterations;
+
// Number of iterations taken by the linear solver to solve for the
// Newton step.
int linear_solver_iterations;
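
A hedged sketch of how the new counters might be consumed from an
IterationCallback; the callback class name is illustrative and not part of this
change:

  #include <cstdio>
  #include "ceres/iteration_callback.h"

  // Logs the per-iteration line search effort reported in IterationSummary.
  class LineSearchLoggingCallback : public ceres::IterationCallback {
   public:
    virtual ceres::CallbackReturnType operator()(
        const ceres::IterationSummary& summary) {
      std::printf("iter: %d  f-evals: %d  g-evals: %d  ls-iters: %d\n",
                  summary.iteration,
                  summary.line_search_function_evaluations,
                  summary.line_search_gradient_evaluations,
                  summary.line_search_iterations);
      return ceres::SOLVER_CONTINUE;
    }
  };

  // Installed before solving via: options.callbacks.push_back(&callback);
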
diff --git a/include/ceres/solver.h b/include/ceres/solver.h
index 47c7e94..e7b8d09 100644
--- a/include/ceres/solver.h
+++ b/include/ceres/solver.h
@@ -60,14 +60,19 @@
Options() {
minimizer_type = TRUST_REGION;
line_search_direction_type = LBFGS;
- line_search_type = ARMIJO;
+ line_search_type = WOLFE;
nonlinear_conjugate_gradient_type = FLETCHER_REEVES;
max_lbfgs_rank = 20;
+ use_approximate_eigenvalue_bfgs_scaling = false;
line_search_interpolation_type = CUBIC;
min_line_search_step_size = 1e-9;
- armijo_sufficient_decrease = 1e-4;
- min_armijo_relative_step_size_change = 1e-3;
- max_armijo_relative_step_size_change = 0.6;
+ line_search_sufficient_function_decrease = 1e-4;
+ max_line_search_step_contraction = 1e-3;
+ min_line_search_step_contraction = 0.6;
+ max_num_line_search_step_size_iterations = 20;
+ max_num_line_search_direction_restarts = 5;
+ line_search_sufficient_curvature_decrease = 0.9;
+ max_line_search_step_expansion = 10.0;
trust_region_strategy_type = LEVENBERG_MARQUARDT;
dogleg_type = TRADITIONAL_DOGLEG;
@@ -178,6 +183,28 @@
// Limited Storage". Mathematics of Computation 35 (151): 773–782.
int max_lbfgs_rank;
+ // As part of the (L)BFGS update step (BFGS) / right-multiply step (L-BFGS),
+ // the initial inverse Hessian approximation is taken to be the Identity.
+ // However, Oren showed that using I * \gamma instead, where \gamma is
+ // chosen to approximate an eigenvalue of the true inverse Hessian, can
+ // result in improved convergence in a wide variety of cases. Setting
+ // use_approximate_eigenvalue_bfgs_scaling to true enables this scaling.
+ //
+ // It is important to note that approximate eigenvalue scaling does not
+ // always improve convergence, and that it can in fact significantly degrade
+ // performance for certain classes of problem, which is why it is disabled
+ // by default. In particular it can degrade performance when the
+ // sensitivity of the problem to different parameters varies significantly,
+ // as in this case a single scalar factor fails to capture this variation
+ // and detrimentally downscales parts of the Jacobian approximation which
+ // correspond to low-sensitivity parameters. It can also reduce the
+ // robustness of the solution to errors in the Jacobians.
+ //
+ // Oren S.S., Self-scaling variable metric (SSVM) algorithms
+ // Part II: Implementation and experiments, Management Science,
+ // 20(5), 863-874, 1974.
+ bool use_approximate_eigenvalue_bfgs_scaling;
+
// Degree of the polynomial used to approximate the objective
// function. Valid values are BISECTION, QUADRATIC and CUBIC.
//
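
For illustration, a small configuration sketch (an assumption, not part of this
patch) of opting into the new scaling flag for an L-BFGS run:

  ceres::Solver::Options options;
  options.minimizer_type = ceres::LINE_SEARCH;
  options.line_search_direction_type = ceres::LBFGS;
  options.max_lbfgs_rank = 20;
  // Enable Oren's approximate eigenvalue scaling of the initial inverse
  // Hessian approximation; disabled by default.
  options.use_approximate_eigenvalue_bfgs_scaling = true;
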
@@ -189,7 +216,7 @@
// value, it is truncated to zero.
double min_line_search_step_size;
- // Armijo line search parameters.
+ // Line search parameters.
// Solving the line search problem exactly is computationally
// prohibitive. Fortunately, line search based optimization
@@ -201,19 +228,63 @@
//
// f(step_size) <= f(0) + sufficient_decrease * f'(0) * step_size
//
- double armijo_sufficient_decrease;
+ double line_search_sufficient_function_decrease;
- // In each iteration of the Armijo line search,
+ // In each iteration of the line search,
//
- // new_step_size >= min_relative_step_size_change * step_size
+ // new_step_size >= max_line_search_step_contraction * step_size
//
- double min_armijo_relative_step_size_change;
+ // Note that by definition, for contraction:
+ //
+ // 0 < max_step_contraction < min_step_contraction < 1
+ //
+ double max_line_search_step_contraction;
- // In each iteration of the Armijo line search,
+ // In each iteration of the line search,
//
- // new_step_size <= max_relative_step_size_change * step_size
+ // new_step_size <= min_line_search_step_contraction * step_size
//
- double max_armijo_relative_step_size_change;
+ // Note that by definition, for contraction:
+ //
+ // 0 < max_step_contraction < min_step_contraction < 1
+ //
+ double min_line_search_step_contraction;
+
+ // Maximum number of trial step size iterations during each line search.
+ // If a step size satisfying the search conditions cannot be found within
+ // this number of trials, the line search will terminate.
+ int max_num_line_search_step_size_iterations;
+
+ // Maximum number of restarts of the line search direction algorithm before
+ // terminating the optimization. Restarts of the line search direction
+ // algorithm occur when the current algorithm fails to produce a new descent
+ // direction. This typically indicates a numerical failure, or a breakdown
+ // in the validity of the approximations used.
+ int max_num_line_search_direction_restarts;
+
+ // The strong Wolfe conditions consist of the Armijo sufficient
+ // decrease condition, and an additional requirement that the
+ // step-size be chosen s.t. the _magnitude_ ('strong' Wolfe
+ // conditions) of the gradient along the search direction
+ // decreases sufficiently. Precisely, this second condition
+ // is that we seek a step_size s.t.
+ //
+ // |f'(step_size)| <= sufficient_curvature_decrease * |f'(0)|
+ //
+ // Where f() is the line search objective and f'() is the derivative
+ // of f w.r.t step_size (d f / d step_size).
+ double line_search_sufficient_curvature_decrease;
+
+ // During the bracketing phase of the Wolfe search, the step size is
+ // increased until either a point satisfying the Wolfe conditions is
+ // found, or an upper bound for a bracket containing a point satisfying
+ // the conditions is found. Precisely, at each iteration of the
+ // expansion:
+ //
+ // new_step_size <= max_step_expansion * step_size.
+ //
+ // By definition for expansion, max_step_expansion > 1.0.
+ double max_line_search_step_expansion;
TrustRegionStrategyType trust_region_strategy_type;
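
To make the interplay of the new line search parameters concrete, a
configuration sketch; all values are the defaults set in Options() above except
the step size iteration cap, which is illustrative:

  ceres::Solver::Options options;
  options.line_search_type = ceres::WOLFE;
  // Armijo sufficient decrease and strong Wolfe curvature conditions.
  options.line_search_sufficient_function_decrease = 1e-4;
  options.line_search_sufficient_curvature_decrease = 0.9;
  // Per-iteration contraction bounds (0 < max < min < 1) and the
  // bracketing-phase expansion factor (> 1).
  options.max_line_search_step_contraction = 1e-3;
  options.min_line_search_step_contraction = 0.6;
  options.max_line_search_step_expansion = 10.0;
  options.max_num_line_search_step_size_iterations = 30;  // Illustrative cap.
  options.max_num_line_search_direction_restarts = 5;
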
diff --git a/include/ceres/types.h b/include/ceres/types.h
index 5becb53..46c2fd2 100644
--- a/include/ceres/types.h
+++ b/include/ceres/types.h
@@ -37,6 +37,8 @@
#ifndef CERES_PUBLIC_TYPES_H_
#define CERES_PUBLIC_TYPES_H_
+#include <string>
+
#include "ceres/internal/port.h"
namespace ceres {
@@ -167,10 +169,47 @@
// used is determined by NonlinearConjugateGradientType.
NONLINEAR_CONJUGATE_GRADIENT,
- // A limited memory approximation to the inverse Hessian is
- // maintained and used to compute a quasi-Newton step.
+ // BFGS, and its limited memory approximation L-BFGS, are quasi-Newton
+ // algorithms that approximate the Hessian matrix by iteratively refining
+ // an initial estimate with low-rank updates using the gradient at each
+ // iteration. They are a generalisation of the Secant method and satisfy
+ // the Secant equation. The Secant equation has infinitely many solutions
+ // in multiple dimensions, as there are N*(N+1)/2 degrees of freedom in a
+ // symmetric matrix but only N conditions are specified by the Secant
+ // equation. The requirement that the Hessian approximation be positive
+ // definite imposes N additional constraints, but that still leaves
+ // remaining degrees of freedom. (L)BFGS methods uniquely determine the
+ // approximate Hessian by imposing the additional constraint that the
+ // approximation at the next iteration must be the 'closest' to the current
+ // approximation (the nature of how this proximity is measured is actually
+ // the defining difference between a family of quasi-Newton methods including
+ // (L)BFGS & DFP). (L)BFGS is currently regarded as the best known
+ // general quasi-Newton method.
//
- // For more details see
+ // The principal difference between BFGS and L-BFGS is that whilst BFGS
+ // maintains a full, dense approximation to the (inverse) Hessian, L-BFGS
+ // maintains only a window of the last M observations of the parameters and
+ // gradients. Using this observation history, the calculation of the next
+ // search direction can be computed without requiring the construction of the
+ // full dense inverse Hessian approximation. This is particularly important
+ // for problems with a large number of parameters, where storage of an N-by-N
+ // matrix in memory would be prohibitive.
+ //
+ // For more details on BFGS see:
+ //
+ // Broyden, C.G., "The Convergence of a Class of Double-rank Minimization
+ // Algorithms,"; J. Inst. Maths. Applics., Vol. 6, pp 76–90, 1970.
+ //
+ // Fletcher, R., "A New Approach to Variable Metric Algorithms,"
+ // Computer Journal, Vol. 13, pp 317–322, 1970.
+ //
+ // Goldfarb, D., "A Family of Variable Metric Updates Derived by Variational
+ // Means," Mathematics of Computing, Vol. 24, pp 23–26, 1970.
+ //
+ // Shanno, D.F., "Conditioning of Quasi-Newton Methods for Function
+ // Minimization," Mathematics of Computing, Vol. 24, pp 647–656, 1970.
+ //
+ // For more details on L-BFGS see:
//
// Nocedal, J. (1980). "Updating Quasi-Newton Matrices with Limited
// Storage". Mathematics of Computation 35 (151): 773–782.
@@ -179,7 +218,12 @@
// "Representations of Quasi-Newton Matrices and their use in
// Limited Memory Methods". Mathematical Programming 63 (4):
// 129–156.
+ //
+ // A general reference for both methods:
+ //
+ // Nocedal J., Wright S., Numerical Optimization, 2nd Ed. Springer, 1999.
LBFGS,
+ BFGS,
};
// Nonlinear conjugate gradient methods are a generalization of the
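
Following the BFGS / L-BFGS discussion above, a hedged selection sketch; the
size threshold is purely illustrative and 'problem' is assumed to already exist:

  ceres::Solver::Options options;
  const int num_parameters = problem.NumParameters();
  // Dense BFGS maintains a full N-by-N inverse Hessian approximation, so
  // prefer the limited memory variant once the parameter count grows large.
  const int kDenseBFGSSizeLimit = 1000;  // Hypothetical threshold.
  if (num_parameters <= kDenseBFGSSizeLimit) {
    options.line_search_direction_type = ceres::BFGS;
  } else {
    options.line_search_direction_type = ceres::LBFGS;
    options.max_lbfgs_rank = 20;  // Window of past parameter/gradient pairs.
  }
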
@@ -198,6 +242,7 @@
// Backtracking line search with polynomial interpolation or
// bisection.
ARMIJO,
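+
+ // Bracketing line search algorithm that seeks a step size satisfying the
+ // strong Wolfe conditions (sufficient decrease and curvature conditions).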
+ WOLFE,
};
// Ceres supports different strategies for computing the trust region