Reduce memory usage in covariance estimation. When using the SPARSE_QR algorithm, a Q-less factorization is now used. This results in significantly less memory usage. The inversion of the semi-normal equations is now threaded using OpenMP. Indeed, if SuiteSparse is compiled with TBB, then both the factorization and the inversion are completely threaded. Change-Id: Ia07591e48e7958d427ef91ff9e67662f6e982c21
diff --git a/internal/ceres/compressed_col_sparse_matrix_utils.h b/internal/ceres/compressed_col_sparse_matrix_utils.h index afabf1c..494a8a5 100644 --- a/internal/ceres/compressed_col_sparse_matrix_utils.h +++ b/internal/ceres/compressed_col_sparse_matrix_utils.h
@@ -61,6 +61,81 @@ const vector<int>& block_ordering, vector<int>* scalar_ordering); +// Solve the linear system +// +// R * solution = rhs +// +// where R is an upper triangular compressed column sparse matrix. +template <typename IntegerType> +void SolveUpperTriangularInPlace(IntegerType num_cols, + const IntegerType* rows, + const IntegerType* cols, + const double* values, + double* rhs_and_solution) { + for (IntegerType c = num_cols - 1; c >= 0; --c) { + rhs_and_solution[c] /= values[cols[c + 1] - 1]; + for (IntegerType idx = cols[c]; idx < cols[c + 1] - 1; ++idx) { + const IntegerType r = rows[idx]; + const double v = values[idx]; + rhs_and_solution[r] -= v * rhs_and_solution[c]; + } + } +}; + +// Solve the linear system +// +// R' * solution = rhs +// +// where R is an upper triangular compressed column sparse matrix. +template <typename IntegerType> +void SolveUpperTriangularTransposeInPlace(IntegerType num_cols, + const IntegerType* rows, + const IntegerType* cols, + const double* values, + double* rhs_and_solution) { + for (IntegerType c = 0; c < num_cols; ++c) { + for (IntegerType idx = cols[c]; idx < cols[c + 1] - 1; ++idx) { + const IntegerType r = rows[idx]; + const double v = values[idx]; + rhs_and_solution[c] -= v * rhs_and_solution[r]; + } + rhs_and_solution[c] = rhs_and_solution[c] / values[cols[c + 1] - 1]; + }; +}; + +// Given an upper triangular matrix R in compressed column form, solve +// the linear system, +// +// R'R x = b +// +// where b is all zeros except for rhs_nonzero_index, where it is +// equal to one. +// +// The function exploits this knowledge to reduce the number of +// floating point operations. 
+template <typename IntegerType> +void SolveRTRWithSparseRHS(IntegerType num_cols, + const IntegerType* rows, + const IntegerType* cols, + const double* values, + const int rhs_nonzero_index, + double* solution) { + fill(solution, solution + num_cols, 0.0); + solution[rhs_nonzero_index] = 1.0 / values[cols[rhs_nonzero_index + 1] - 1]; + + for (IntegerType c = rhs_nonzero_index + 1; c < num_cols; ++c) { + for (IntegerType idx = cols[c]; idx < cols[c + 1] - 1; ++idx) { + const IntegerType r = rows[idx]; + if (r < rhs_nonzero_index) continue; + const double v = values[idx]; + solution[c] -= v * solution[r]; + } + solution[c] = solution[c] / values[cols[c + 1] - 1]; + }; + SolveUpperTriangularInPlace(num_cols, rows, cols, values, solution); +}; + + } // namespace internal } // namespace ceres
diff --git a/internal/ceres/compressed_col_sparse_matrix_utils_test.cc b/internal/ceres/compressed_col_sparse_matrix_utils_test.cc index e810837..3faf06c 100644 --- a/internal/ceres/compressed_col_sparse_matrix_utils_test.cc +++ b/internal/ceres/compressed_col_sparse_matrix_utils_test.cc
@@ -193,5 +193,92 @@ ss.Free(ccsm.release()); } +class SolveUpperTriangularTest : public ::testing::Test { + protected: + void SetUp() { + cols.resize(5); + rows.resize(7); + values.resize(7); + + cols[0] = 0; + rows[0] = 0; + values[0] = 0.50754; + + cols[1] = 1; + rows[1] = 1; + values[1] = 0.80483; + + cols[2] = 2; + rows[2] = 1; + values[2] = 0.14120; + rows[3] = 2; + values[3] = 0.3; + + cols[3] = 4; + rows[4] = 0; + values[4] = 0.77696; + rows[5] = 1; + values[5] = 0.41860; + rows[6] = 3; + values[6] = 0.88979; + + cols[4] = 7; + } + + vector<int> cols; + vector<int> rows; + vector<double> values; +}; + +TEST_F(SolveUpperTriangularTest, SolveInPlace) { + double rhs_and_solution[] = {1.0, 1.0, 2.0, 2.0}; + const double expected[] = { -1.4706, -1.0962, 6.6667, 2.2477}; + + SolveUpperTriangularInPlace<int>(cols.size() - 1, + &rows[0], + &cols[0], + &values[0], + rhs_and_solution); + + for (int i = 0; i < 4; ++i) { + EXPECT_NEAR(rhs_and_solution[i], expected[i], 1e-4) << i; + } +} + +TEST_F(SolveUpperTriangularTest, TransposeSolveInPlace) { + double rhs_and_solution[] = {1.0, 1.0, 2.0, 2.0}; + double expected[] = {1.970288, 1.242498, 6.081864, -0.057255}; + + SolveUpperTriangularTransposeInPlace<int>(cols.size() - 1, + &rows[0], + &cols[0], + &values[0], + rhs_and_solution); + + for (int i = 0; i < 4; ++i) { + EXPECT_NEAR(rhs_and_solution[i], expected[i], 1e-4) << i; + } +} + +TEST_F(SolveUpperTriangularTest, RTRSolveWithSparseRHS) { + double solution[4]; + double expected[] = { 6.8420e+00, 1.0057e+00, -1.4907e-16, -1.9335e+00, + 1.0057e+00, 2.2275e+00, -1.9493e+00, -6.5693e-01, + -1.4907e-16, -1.9493e+00, 1.1111e+01, 9.7381e-17, + -1.9335e+00, -6.5693e-01, 9.7381e-17, 1.2631e+00 }; + + for (int i = 0; i < 4; ++i) { + SolveRTRWithSparseRHS<int>(cols.size() - 1, + &rows[0], + &cols[0], + &values[0], + i, + solution); + for (int j = 0; j < 4; ++j) { + EXPECT_NEAR(solution[j], expected[4 * i + j], 1e-3) << i; + } + } +} + } // namespace internal } // namespace ceres
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc index 61feb6b..19d545c 100644 --- a/internal/ceres/covariance_impl.cc +++ b/internal/ceres/covariance_impl.cc
@@ -38,6 +38,7 @@ #include <utility> #include <vector> #include "Eigen/SVD" +#include "ceres/compressed_col_sparse_matrix_utils.h" #include "ceres/compressed_row_sparse_matrix.h" #include "ceres/covariance.h" #include "ceres/crs_matrix.h" @@ -55,6 +56,7 @@ // Per thread storage for SuiteSparse. #ifndef CERES_NO_SUITESPARSE + struct PerThreadContext { explicit PerThreadContext(int num_rows) : solution(NULL), @@ -80,6 +82,7 @@ cholmod_dense* rhs; SuiteSparse ss; }; + #endif } // namespace @@ -605,7 +608,6 @@ vector<SuiteSparse_long> transpose_rows(num_cols + 1, 0); vector<SuiteSparse_long> transpose_cols(num_nonzeros, 0); - vector<double> transpose_values(num_nonzeros, 0); for (int idx = 0; idx < num_nonzeros; ++idx) { @@ -650,23 +652,49 @@ cholmod_common cc; cholmod_l_start(&cc); - SuiteSparseQR_factorization<double>* factor = - SuiteSparseQR_factorize<double>(SPQR_ORDERING_BESTAMD, - SPQR_DEFAULT_TOL, - &cholmod_jacobian, - &cc); - event_logger.AddEvent("Numeric Factorization"); + cholmod_sparse* R = NULL; + SuiteSparse_long* permutation = NULL; - const int rank = cc.SPQR_istat[4]; + // Compute a Q-less QR factorization of the Jacobian. Since we are + // only interested in inverting J'J = R'R, we do not need Q. This + // saves memory and gives us R as a permuted compressed column + // sparse matrix. + // + // TODO(sameeragarwal): Currently the symbolic factorization and the + // numeric factorization are done at the same time, and this does not + // explicitly account for the block column and row structure in the + // matrix. When using AMD, we have observed in the past that + // computing the ordering with the block matrix is significantly + // more efficient, both in runtime as well as the quality of + // ordering computed. So, it may be worth doing that analysis + // separately. 
+ const SuiteSparse_long rank = + SuiteSparseQR<double>(SPQR_ORDERING_BESTAMD, + SPQR_DEFAULT_TOL, + cholmod_jacobian.ncol, + &cholmod_jacobian, + &R, + &permutation, + &cc); + event_logger.AddEvent("Numeric Factorization"); + CHECK_NOTNULL(permutation); + CHECK_NOTNULL(R); + if (rank < cholmod_jacobian.ncol) { LOG(WARNING) << "Jacobian matrix is rank deficient." << "Number of columns: " << cholmod_jacobian.ncol << " rank: " << rank; - SuiteSparseQR_free(&factor, &cc); + delete []permutation; + cholmod_l_free_sparse(&R, &cc); cholmod_l_finish(&cc); return false; } + vector<int> inverse_permutation(num_cols); + for (SuiteSparse_long i = 0; i < num_cols; ++i) { + inverse_permutation[permutation[i]] = i; + } + const int* rows = covariance_matrix_->rows(); const int* cols = covariance_matrix_->cols(); double* values = covariance_matrix_->mutable_values(); @@ -680,35 +708,39 @@ // // Since the covariance matrix is symmetric, the i^th row and column // are equal. + const int num_threads = options_.num_threads; + scoped_array<double> workspace(new double[num_threads * num_cols]); - cholmod_dense* rhs = cholmod_l_zeros(num_cols, 1, CHOLMOD_REAL, &cc); - double* rhs_x = reinterpret_cast<double*>(rhs->x); - +#pragma omp parallel for num_threads(num_threads) schedule(dynamic) for (int r = 0; r < num_cols; ++r) { - int row_begin = rows[r]; - int row_end = rows[r + 1]; + const int row_begin = rows[r]; + const int row_end = rows[r + 1]; if (row_end == row_begin) { continue; } - rhs_x[r] = 1.0; +# ifdef CERES_USE_OPENMP + int thread_id = omp_get_thread_num(); +# else + int thread_id = 0; +# endif - cholmod_dense* y1 = SuiteSparseQR_solve<double>(SPQR_RTX_EQUALS_ETB, factor, rhs, &cc); - cholmod_dense* solution = SuiteSparseQR_solve<double>(SPQR_RETX_EQUALS_B, factor, y1, &cc); - - double* solution_x = reinterpret_cast<double*>(solution->x); + double* solution = workspace.get() + thread_id * num_cols; + SolveRTRWithSparseRHS<SuiteSparse_long>( + num_cols, + 
static_cast<SuiteSparse_long*>(R->i), + static_cast<SuiteSparse_long*>(R->p), + static_cast<double*>(R->x), + inverse_permutation[r], + solution); for (int idx = row_begin; idx < row_end; ++idx) { - const int c = cols[idx]; - values[idx] = solution_x[c]; + const int c = cols[idx]; + values[idx] = solution[inverse_permutation[c]]; } - - cholmod_l_free_dense(&y1, &cc); - cholmod_l_free_dense(&solution, &cc); - rhs_x[r] = 0.0; } - cholmod_l_free_dense(&rhs, &cc); - SuiteSparseQR_free(&factor, &cc); + delete []permutation; + cholmod_l_free_sparse(&R, &cc); cholmod_l_finish(&cc); event_logger.AddEvent("Inversion"); return true;
diff --git a/internal/ceres/covariance_test.cc b/internal/ceres/covariance_test.cc index e7d25a1..f3a5051 100644 --- a/internal/ceres/covariance_test.cc +++ b/internal/ceres/covariance_test.cc
@@ -499,6 +499,9 @@ #ifndef CERES_NO_SUITESPARSE options.algorithm_type = SPARSE_CHOLESKY; ComputeAndCompareCovarianceBlocks(options, expected_covariance); + + options.algorithm_type = SPARSE_QR; + ComputeAndCompareCovarianceBlocks(options, expected_covariance); #endif options.algorithm_type = DENSE_SVD; @@ -552,6 +555,9 @@ #ifndef CERES_NO_SUITESPARSE options.algorithm_type = SPARSE_CHOLESKY; ComputeAndCompareCovarianceBlocks(options, expected_covariance); + + options.algorithm_type = SPARSE_QR; + ComputeAndCompareCovarianceBlocks(options, expected_covariance); #endif options.algorithm_type = DENSE_SVD; @@ -776,6 +782,7 @@ TEST_F(LargeScaleCovarianceTest, Parallel) { ComputeAndCompare(SPARSE_CHOLESKY, 4); + ComputeAndCompare(SPARSE_QR, 4); } #endif // !defined(CERES_NO_SUITESPARSE) && defined(CERES_USE_OPENMP)