Multithread covariance estimation. 1. Multithread the inversion of J'J. 2. Simplify the dense rank truncation loop. 3. Minor correction to building documentation. Change-Id: Ide932811c0f28dc6c253809339fb2caa083865b5
diff --git a/docs/source/building.rst b/docs/source/building.rst index 17e17d4..b91027b 100644 --- a/docs/source/building.rst +++ b/docs/source/building.rst
@@ -72,7 +72,7 @@ .. code-block:: bash # CMake - sudo apt-hey install cmake + sudo apt-get install cmake # gflags tar -xvzf gflags-2.0.tar.gz cd gflags-2.0
diff --git a/include/ceres/covariance.h b/include/ceres/covariance.h index f65d9eb..b093026 100644 --- a/include/ceres/covariance.h +++ b/include/ceres/covariance.h
@@ -51,7 +51,7 @@ // 1. This is experimental code and the API WILL CHANGE before // release. // -// 2. WARNING: It is very easy to use this class incorrectly without +// 2. It is very easy to use this class incorrectly without // understanding the underlying mathematics. Please read and // understand the documentation completely before attempting to use // this class. @@ -166,6 +166,21 @@ // with indeterminacy. IEEE Transactions on Information Theory 47(5): // 2017-2028 (2001) // +// Speed +// ----- +// +// When use_dense_linear_algebra = true, Eigen's JacobiSVD algorithm +// is used to perform the computations. It is an accurate but slow +// method and should only be used for small to moderate sized +// problems. +// +// When use_dense_linear_algebra = false, SuiteSparse/CHOLMOD is used +// to perform the computation. Recent versions of SuiteSparse (>= +// 4.2.0) provide a much more efficient method for solving for rows of +// the covariance matrix. Therefore, if you are doing large scale +// covariance estimation, we strongly recommend using a recent version +// of SuiteSparse. +// // Example Usage // ============= // @@ -248,6 +263,27 @@ // and by default Covariance::Compute will return false if it // encounters such a matrix. // + // use_dense_linear_algebra = false + // -------------------------------- + // + // When performing large scale sparse covariance estimation, + // computing the exact value of the reciprocal condition number is + // not possible as it would require computing the eigenvalues of + // J'J. + // + // In this case we use cholmod_rcond, which uses the ratio of the + // smallest to the largest diagonal entries of the Cholesky + // factorization as an approximation to the reciprocal condition + // number. + // + // However, care must be taken as this is a heuristic and can + // sometimes be a very crude estimate. The default value of + // min_reciprocal_condition_number has been set to a conservative + // value, and sometimes the Covariance::Compute may return false + // even if it is possible to estimate the covariance reliably. In + // such cases, the user should exercise their judgement before + // lowering the value of min_reciprocal_condition_number. + // // use_dense_linear_algebra = true // ------------------------------- //
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc index 966ba35..978acc1 100644 --- a/internal/ceres/covariance_impl.cc +++ b/internal/ceres/covariance_impl.cc
@@ -30,6 +30,10 @@ #include "ceres/covariance_impl.h" +#ifdef CERES_USE_OPENMP +#include <omp.h> +#endif + #include <algorithm> #include <utility> #include <vector> @@ -47,6 +51,36 @@ namespace ceres { namespace internal { +namespace { + +// Per thread storage for SuiteSparse. +struct PerThreadContext { + PerThreadContext(int num_rows) + : solution(NULL), + solution_set(NULL), + y_workspace(NULL), + e_workspace(NULL), + rhs(NULL) { + rhs = ss.CreateDenseVector(NULL, num_rows, num_rows); + } + + ~PerThreadContext() { + ss.Free(solution); + ss.Free(solution_set); + ss.Free(y_workspace); + ss.Free(e_workspace); + ss.Free(rhs); + } + + cholmod_dense* solution; + cholmod_sparse* solution_set; + cholmod_dense* y_workspace; + cholmod_dense* e_workspace; + cholmod_dense* rhs; + SuiteSparse ss; +}; + +} // namespace typedef vector<pair<const double*, const double*> > CovarianceBlocks; @@ -424,9 +458,6 @@ const int* cols = covariance_matrix_->cols(); double* values = covariance_matrix_->mutable_values(); - cholmod_dense* rhs = ss_.CreateDenseVector(NULL, num_rows, num_rows); - double* rhs_x = reinterpret_cast<double*>(rhs->x); - // The following loop exploits the fact that the i^th column of A^{-1} // is given by the solution to the linear system // @@ -441,6 +472,9 @@ // versions of SuiteSparse have the cholmod_solve2 function which // re-uses memory across calls. #if (SUITESPARSE_VERSION < 4002) + cholmod_dense* rhs = ss_.CreateDenseVector(NULL, num_rows, num_rows); + double* rhs_x = reinterpret_cast<double*>(rhs->x); + for (int r = 0; r < num_rows; ++r) { int row_begin = rows[r]; int row_end = rows[r + 1]; @@ -458,15 +492,35 @@ ss_.Free(solution); rhs_x[r] = 0.0; } -#else - // TODO(sameeragarwal) There should be a more efficient way - // involving the use of Bset but I am unable to make it work right - // now. - cholmod_dense* solution = NULL; - cholmod_sparse* solution_set = NULL; - cholmod_dense* y_workspace = NULL; - cholmod_dense* e_workspace = NULL; + ss_.Free(rhs); +#else // SUITESPARSE_VERSION < 4002 + + const int num_threads = options_.num_threads; + vector<PerThreadContext*> contexts(num_threads); + for (int i = 0; i < num_threads; ++i) { + contexts[i] = new PerThreadContext(num_rows); + } + + // The first call to cholmod_solve2 is not thread safe, since it + // changes the factorization from supernodal to simplicial etc. + { + PerThreadContext* context = contexts[0]; + double* context_rhs_x = reinterpret_cast<double*>(context->rhs->x); + context_rhs_x[0] = 1.0; + cholmod_solve2(CHOLMOD_A, + factor, + context->rhs, + NULL, + &context->solution, + &context->solution_set, + &context->y_workspace, + &context->e_workspace, + context->ss.mutable_cc()); + context_rhs_x[0] = 0.0; + } + +#pragma omp parallel for num_threads(num_threads) schedule(dynamic) for (int r = 0; r < num_rows; ++r) { int row_begin = rows[r]; int row_end = rows[r + 1]; @@ -474,40 +528,52 @@ continue; } - rhs_x[r] = 1.0; +# ifdef CERES_USE_OPENMP + int thread_id = omp_get_thread_num(); +# else + int thread_id = 0; +# endif + PerThreadContext* context = contexts[thread_id]; + double* context_rhs_x = reinterpret_cast<double*>(context->rhs->x); + context_rhs_x[r] = 1.0; + + // TODO(sameeragarwal) There should be a more efficient way + // involving the use of Bset but I am unable to make it work right + // now. cholmod_solve2(CHOLMOD_A, factor, - rhs, + context->rhs, NULL, - &solution, - &solution_set, - &y_workspace, - &e_workspace, - ss_.mutable_cc()); + &context->solution, + &context->solution_set, + &context->y_workspace, + &context->e_workspace, + context->ss.mutable_cc()); - double* solution_x = reinterpret_cast<double*>(solution->x); + double* solution_x = reinterpret_cast<double*>(context->solution->x); for (int idx = row_begin; idx < row_end; ++idx) { const int c = cols[idx]; values[idx] = solution_x[c]; } - rhs_x[r] = 0.0; + context_rhs_x[r] = 0.0; } - ss_.Free(solution); - ss_.Free(solution_set); - ss_.Free(y_workspace); - ss_.Free(e_workspace); + for (int i = 0; i < num_threads; ++i) { + delete contexts[i]; + } -#endif +#endif // SUITESPARSE_VERSION < 4002 - ss_.Free(rhs); ss_.Free(factor); event_logger.AddEvent("Inversion"); return true; -#else + +#else // CERES_NO_SUITESPARSE + return false; -#endif + +#endif // CERES_NO_SUITESPARSE }; bool CovarianceImpl::ComputeCovarianceValuesUsingEigen() { @@ -549,19 +615,32 @@ const int max_rank = min(num_singular_values, num_singular_values - options_.null_space_rank); + // Compute the squared inverse of the singular values. Truncate the + // computation based on min_singular_value_ratio and + // null_space_rank. When either of these two quantities are active, + // the resulting covariance matrix is a Moore-Penrose inverse + // instead of a regular inverse. for (int i = 0; i < max_rank; ++i) { const double singular_value_ratio = singular_values[i] / max_singular_value; - if (singular_value_ratio >= min_singular_value_ratio) { - inverse_squared_singular_values[i] = - 1.0 / (singular_values[i] * singular_values[i]); - } else if (!automatic_truncation) { - LOG(WARNING) << "Cholesky factorization of J'J is not reliable. " - << "Reciprocal condition number: " - << singular_value_ratio * singular_value_ratio << " " - << "min_reciprocal_condition_number : " - << options_.min_reciprocal_condition_number; - return false; + if (singular_value_ratio < min_singular_value_ratio) { + // Since the singular values are in decreasing order, if + // automatic truncation is enabled, then from this point on + // all values will fail the ratio test and there is nothing to + // do in this loop. + if (automatic_truncation) { + break; + } else { + LOG(WARNING) << "Cholesky factorization of J'J is not reliable. " + << "Reciprocal condition number: " + << singular_value_ratio * singular_value_ratio << " " + << "min_reciprocal_condition_number : " + << options_.min_reciprocal_condition_number; + return false; + } } + + inverse_squared_singular_values[i] = + 1.0 / (singular_values[i] * singular_values[i]); } Matrix dense_covariance = @@ -585,7 +664,5 @@ return true; }; - - } // namespace internal } // namespace ceres
diff --git a/internal/ceres/covariance_test.cc b/internal/ceres/covariance_test.cc index 05fb234..bd0a0e2 100644 --- a/internal/ceres/covariance_test.cc +++ b/internal/ceres/covariance_test.cc
@@ -405,6 +405,50 @@ ComputeAndCompareCovarianceBlocks(options, expected_covariance); } +#ifdef CERES_USE_OPENMP + +TEST_F(CovarianceTest, ThreadedNormalBehavior) { + // J + // + // 1 0 0 0 0 0 + // 0 1 0 0 0 0 + // 0 0 2 0 0 0 + // 0 0 0 2 0 0 + // 0 0 0 0 2 0 + // 0 0 0 0 0 5 + // -5 -6 1 2 3 0 + // 3 -2 0 0 0 2 + + // J'J + // + // 35 24 -5 -10 -15 6 + // 24 41 -6 -12 -18 -4 + // -5 -6 5 2 3 0 + // -10 -12 2 8 6 0 + // -15 -18 3 6 13 0 + // 6 -4 0 0 0 29 + + // inv(J'J) computed using octave. + double expected_covariance[] = { + 7.0747e-02, -8.4923e-03, 1.6821e-02, 3.3643e-02, 5.0464e-02, -1.5809e-02, // NOLINT + -8.4923e-03, 8.1352e-02, 2.4758e-02, 4.9517e-02, 7.4275e-02, 1.2978e-02, // NOLINT + 1.6821e-02, 2.4758e-02, 2.4904e-01, -1.9271e-03, -2.8906e-03, -6.5325e-05, // NOLINT + 3.3643e-02, 4.9517e-02, -1.9271e-03, 2.4615e-01, -5.7813e-03, -1.3065e-04, // NOLINT + 5.0464e-02, 7.4275e-02, -2.8906e-03, -5.7813e-03, 2.4133e-01, -1.9598e-04, // NOLINT + -1.5809e-02, 1.2978e-02, -6.5325e-05, -1.3065e-04, -1.9598e-04, 3.9544e-02, // NOLINT + }; + + Covariance::Options options; + options.num_threads = 4; + + options.use_dense_linear_algebra = false; + ComputeAndCompareCovarianceBlocks(options, expected_covariance); + + options.use_dense_linear_algebra = true; + ComputeAndCompareCovarianceBlocks(options, expected_covariance); +} + +#endif // CERES_USE_OPENMP TEST_F(CovarianceTest, ConstantParameterBlock) { problem_.SetParameterBlockConstant(parameters_); @@ -644,5 +688,80 @@ ComputeAndCompareCovarianceBlocks(options, expected_covariance); } +class LargeScaleCovarianceTest : public ::testing::Test { + protected: + virtual void SetUp() { + num_parameter_blocks_ = 2000; + parameter_block_size_ = 5; + parameters_.reset(new double[parameter_block_size_ * num_parameter_blocks_]); + + Matrix jacobian(parameter_block_size_, parameter_block_size_); + for (int i = 0; i < num_parameter_blocks_; ++i) { + jacobian.setIdentity(); + jacobian *= (i + 1); + + double* block_i = parameters_.get() + i * parameter_block_size_; + problem_.AddResidualBlock(new UnaryCostFunction(parameter_block_size_, + parameter_block_size_, + jacobian.data()), + NULL, + block_i ); + for (int j = i; j < num_parameter_blocks_; ++j) { + double* block_j = parameters_.get() + j * parameter_block_size_; + all_covariance_blocks_.push_back(make_pair(block_i, block_j)); + } + } + } + + void ComputeAndCompare(int num_threads) { + Covariance::Options options; + options.use_dense_linear_algebra = false; + options.num_threads = num_threads; + Covariance covariance(options); + EXPECT_TRUE(covariance.Compute(all_covariance_blocks_, &problem_)); + + Matrix expected(parameter_block_size_, parameter_block_size_); + Matrix actual(parameter_block_size_, parameter_block_size_); + const double kTolerance = 1e-16; + + for (int i = 0; i < num_parameter_blocks_; ++i) { + expected.setIdentity(); + expected /= (i + 1.0) * (i + 1.0); + + double* block_i = parameters_.get() + i * parameter_block_size_; + covariance.GetCovarianceBlock(block_i, block_i, actual.data()); + EXPECT_NEAR((expected - actual).norm(), 0.0, kTolerance) + << "block: " << i << ", " << i << "\n" + << "expected: \n" << expected << "\n" + << "actual: \n" << actual; + + expected.setZero(); + for (int j = i + 1; j < num_parameter_blocks_; ++j) { + double* block_j = parameters_.get() + j * parameter_block_size_; + covariance.GetCovarianceBlock(block_i, block_j, actual.data()); + EXPECT_NEAR((expected - actual).norm(), 0.0, kTolerance) + << "block: " << i << ", " << j << "\n" + << "expected: \n" << expected << "\n" + << "actual: \n" << actual; + } + } + } + + scoped_array<double> parameters_; + int parameter_block_size_; + int num_parameter_blocks_; + + Problem problem_; + vector<pair<const double*, const double*> > all_covariance_blocks_; +}; + +#if !defined(CERES_NO_SUITESPARSE) && defined(CERES_USE_OPENMP) + +TEST_F(LargeScaleCovarianceTest, Parallel) { + ComputeAndCompare(4); +} + +#endif // !defined(CERES_NO_SUITESPARSE) && defined(CERES_USE_OPENMP) + } // namespace internal } // namespace ceres