// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2012 Google Inc. All rights reserved.
// http://code.google.com/p/ceres-solver/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)

#include "ceres/low_rank_inverse_hessian.h"

#include "ceres/internal/eigen.h"
#include "glog/logging.h"

namespace ceres {
namespace internal {

// The (L)BFGS algorithm explicitly requires that the secant equation:
//
//   B_{k+1} * s_k = y_k
//
// is satisfied at each iteration, where B_{k+1} is the approximate
// Hessian at the (k+1)-th iteration, s_k = (x_{k+1} - x_k) and
// y_k = (grad_{k+1} - grad_k). As the approximate Hessian must be
// positive definite, this is equivalent to the condition:
//
//   s_k^T * y_k > 0    [since s_k^T * B_{k+1} * s_k = s_k^T * y_k > 0]
//
// This condition is always satisfied if the function is strictly convex;
// alternatively, it is always satisfied provided that a Wolfe line search
// is used (even if the function is not strictly convex). See [1] (p138)
// for a proof.
//
// Although Ceres will always use a Wolfe line search when using (L)BFGS,
// practical implementation considerations mean that the line search may
// return a point that satisfies only the Armijo condition, and thus could
// violate the secant equation. As such, we only use a step to update the
// Hessian approximation if:
//
//   s_k^T * y_k > tolerance
//
// It is important that tolerance is very small (and >= 0), as otherwise we
// might skip the update too often and fail to capture important curvature
// information in the Hessian. For example, going from 1e-10 -> 1e-14
// improves the NIST benchmark score from 43/54 to 53/54.
//
// [1] Nocedal J., Wright S., Numerical Optimization, 2nd Ed., Springer, 1999.
//
// TODO: Consider using a damped BFGS update instead of skipping the update.
const double kLBFGSSecantConditionHessianUpdateTolerance = 1e-14;

LowRankInverseHessian::LowRankInverseHessian(
    int num_parameters,
    int max_num_corrections,
    bool use_approximate_eigenvalue_scaling)
    : num_parameters_(num_parameters),
      max_num_corrections_(max_num_corrections),
      use_approximate_eigenvalue_scaling_(use_approximate_eigenvalue_scaling),
      num_corrections_(0),
      approximate_eigenvalue_scale_(1.0),
      delta_x_history_(num_parameters, max_num_corrections),
      delta_gradient_history_(num_parameters, max_num_corrections),
      delta_x_dot_delta_gradient_(max_num_corrections) {
}

bool LowRankInverseHessian::Update(const Vector& delta_x,
                                   const Vector& delta_gradient) {
  const double delta_x_dot_delta_gradient = delta_x.dot(delta_gradient);
  if (delta_x_dot_delta_gradient <=
      kLBFGSSecantConditionHessianUpdateTolerance) {
    LOG(WARNING) << "Skipping L-BFGS Update, delta_x_dot_delta_gradient too "
                 << "small: " << delta_x_dot_delta_gradient << ", tolerance: "
                 << kLBFGSSecantConditionHessianUpdateTolerance
                 << " (Secant condition).";
    return false;
  }

  if (num_corrections_ == max_num_corrections_) {
    // TODO(sameeragarwal): This can be done more efficiently using
    // a circular buffer/indexing scheme, but for simplicity we will
    // do the expensive copy for now.
    delta_x_history_.block(0, 0, num_parameters_, max_num_corrections_ - 1) =
        delta_x_history_
        .block(0, 1, num_parameters_, max_num_corrections_ - 1);

    delta_gradient_history_
        .block(0, 0, num_parameters_, max_num_corrections_ - 1) =
        delta_gradient_history_
        .block(0, 1, num_parameters_, max_num_corrections_ - 1);

    delta_x_dot_delta_gradient_.head(num_corrections_ - 1) =
        delta_x_dot_delta_gradient_.tail(num_corrections_ - 1);
  } else {
    ++num_corrections_;
  }

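  // Store the most recent correction pair and its inner product at the
  // end of the history.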
  delta_x_history_.col(num_corrections_ - 1) = delta_x;
  delta_gradient_history_.col(num_corrections_ - 1) = delta_gradient;
  delta_x_dot_delta_gradient_(num_corrections_ - 1) =
      delta_x_dot_delta_gradient;
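  // As discussed in RightMultiply(), gamma_k = (s_k' * y_k) / (y_k' * y_k)
  // approximates an eigenvalue of the true inverse Hessian along the most
  // recent step, and is used to scale the initial approximation H_0.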
  approximate_eigenvalue_scale_ =
      delta_x_dot_delta_gradient / delta_gradient.squaredNorm();
  return true;
}

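// Computes search_direction = H_k * gradient using the L-BFGS two-loop
// recursion (Algorithm 7.4 in Nocedal J., Wright S., Numerical
// Optimization, 2nd Ed.), which applies the low-rank inverse Hessian
// approximation implicitly from the stored correction pairs, without ever
// forming the dense num_parameters x num_parameters matrix H_k.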
void LowRankInverseHessian::RightMultiply(const double* x_ptr,
                                          double* y_ptr) const {
  ConstVectorRef gradient(x_ptr, num_parameters_);
  VectorRef search_direction(y_ptr, num_parameters_);

  search_direction = gradient;

  Vector alpha(num_corrections_);

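  // First loop of the recursion: working backwards from the most recent
  // correction pair to the oldest, project out the components of the
  // gradient along each stored delta_gradient direction.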
  for (int i = num_corrections_ - 1; i >= 0; --i) {
    alpha(i) = delta_x_history_.col(i).dot(search_direction) /
        delta_x_dot_delta_gradient_(i);
    search_direction -= alpha(i) * delta_gradient_history_.col(i);
  }

  if (use_approximate_eigenvalue_scaling_) {
    // Rescale the initial inverse Hessian approximation (H_0) to be
    // iteratively updated so that it is of similar 'size' to the true
    // inverse Hessian along the most recent search direction. As shown
    // in [1]:
    //
    //   \gamma_k = (delta_gradient_{k-1}' * delta_x_{k-1}) /
    //              (delta_gradient_{k-1}' * delta_gradient_{k-1})
    //
    // satisfies:
    //
    //   (1 / \lambda_m) <= \gamma_k <= (1 / \lambda_1)
    //
    // where \lambda_1 and \lambda_m are the smallest and largest
    // eigenvalues of the true Hessian (not the inverse) along the most
    // recent search direction respectively. Thus \gamma_k is an
    // approximate eigenvalue of the true inverse Hessian, and choosing
    // H_0 = I * \gamma_k will yield a starting point that has a similar
    // scale to the true inverse Hessian. This technique is widely
    // reported to often improve convergence; however, this is not
    // universally true, particularly if there are errors in the initial
    // Jacobians, or if there are significant differences in the
    // sensitivity of the problem to the parameters (i.e. the range of the
    // magnitudes of the components of the gradient is large).
    //
    // The origin of this rescaling trick is somewhat unclear; the
    // earliest reference appears to be Oren [1], but it is widely
    // discussed without specific attribution in various texts, including
    // [2] (p143/178).
    //
    // [1] Oren S.S., Self-scaling variable metric (SSVM) algorithms
    //     Part II: Implementation and experiments, Management Science,
    //     20(5), 863-874, 1974.
    // [2] Nocedal J., Wright S., Numerical Optimization, Springer, 1999.
    search_direction *= approximate_eigenvalue_scale_;
  }

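  // Second loop of the recursion: working forwards from the oldest
  // correction pair to the most recent, add back the corrections along
  // each stored delta_x direction, completing the implicit product with
  // the inverse Hessian approximation.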
  for (int i = 0; i < num_corrections_; ++i) {
    const double beta = delta_gradient_history_.col(i).dot(search_direction) /
        delta_x_dot_delta_gradient_(i);
    search_direction += delta_x_history_.col(i) * (alpha(i) - beta);
  }
}

}  // namespace internal
}  // namespace ceres