Reduce memory usage in covariance estimation.
When using the SPARSE_QR algorithm, now a Q-less
factorization is used. This results in significantly
less memory usage.
The inversion of the semi-normal equations is now
threaded using openmp. Indeed if one has SuiteSparse
compiled with TBB, then both the factorization
and the inversion are completely threaded.
Change-Id: Ia07591e48e7958d427ef91ff9e67662f6e982c21
diff --git a/internal/ceres/compressed_col_sparse_matrix_utils.h b/internal/ceres/compressed_col_sparse_matrix_utils.h
index afabf1c..494a8a5 100644
--- a/internal/ceres/compressed_col_sparse_matrix_utils.h
+++ b/internal/ceres/compressed_col_sparse_matrix_utils.h
@@ -61,6 +61,81 @@
const vector<int>& block_ordering,
vector<int>* scalar_ordering);
+// Solve the linear system
+//
+// R * solution = rhs
+//
+// Where R is an upper triangular compressed column sparse matrix.
+template <typename IntegerType>
+void SolveUpperTriangularInPlace(IntegerType num_cols,
+ const IntegerType* rows,
+ const IntegerType* cols,
+ const double* values,
+ double* rhs_and_solution) {
+ for (IntegerType c = num_cols - 1; c >= 0; --c) {
+ rhs_and_solution[c] /= values[cols[c + 1] - 1];
+ for (IntegerType idx = cols[c]; idx < cols[c + 1] - 1; ++idx) {
+ const IntegerType r = rows[idx];
+ const double v = values[idx];
+ rhs_and_solution[r] -= v * rhs_and_solution[c];
+ }
+ }
+};
+
+// Solve the linear system
+//
+// R' * solution = rhs
+//
+// Where R is an upper triangular compressed column sparse matrix.
+template <typename IntegerType>
+void SolveUpperTriangularTransposeInPlace(IntegerType num_cols,
+ const IntegerType* rows,
+ const IntegerType* cols,
+ const double* values,
+ double* rhs_and_solution) {
+ for (IntegerType c = 0; c < num_cols; ++c) {
+ for (IntegerType idx = cols[c]; idx < cols[c + 1] - 1; ++idx) {
+ const IntegerType r = rows[idx];
+ const double v = values[idx];
+ rhs_and_solution[c] -= v * rhs_and_solution[r];
+ }
+ rhs_and_solution[c] = rhs_and_solution[c] / values[cols[c + 1] - 1];
+ };
+};
+
+// Given a upper triangular matrix R in compressed column form, solve
+// the linear system,
+//
+// R'R x = b
+//
+// Where b is all zeros except for rhs_nonzero_index, where it is
+// equal to one.
+//
+// The function exploits this knowledge to reduce the number of
+// floating point operations.
+template <typename IntegerType>
+void SolveRTRWithSparseRHS(IntegerType num_cols,
+ const IntegerType* rows,
+ const IntegerType* cols,
+ const double* values,
+ const int rhs_nonzero_index,
+ double* solution) {
+ fill(solution, solution + num_cols, 0.0);
+ solution[rhs_nonzero_index] = 1.0 / values[cols[rhs_nonzero_index + 1] - 1];
+
+ for (IntegerType c = rhs_nonzero_index + 1; c < num_cols; ++c) {
+ for (IntegerType idx = cols[c]; idx < cols[c + 1] - 1; ++idx) {
+ const IntegerType r = rows[idx];
+ if (r < rhs_nonzero_index) continue;
+ const double v = values[idx];
+ solution[c] -= v * solution[r];
+ }
+ solution[c] = solution[c] / values[cols[c + 1] - 1];
+ };
+ SolveUpperTriangularInPlace(num_cols, rows, cols, values, solution);
+};
+
+
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/compressed_col_sparse_matrix_utils_test.cc b/internal/ceres/compressed_col_sparse_matrix_utils_test.cc
index e810837..3faf06c 100644
--- a/internal/ceres/compressed_col_sparse_matrix_utils_test.cc
+++ b/internal/ceres/compressed_col_sparse_matrix_utils_test.cc
@@ -193,5 +193,92 @@
ss.Free(ccsm.release());
}
+class SolveUpperTriangularTest : public ::testing::Test {
+ protected:
+ void SetUp() {
+ cols.resize(5);
+ rows.resize(7);
+ values.resize(7);
+
+ cols[0] = 0;
+ rows[0] = 0;
+ values[0] = 0.50754;
+
+ cols[1] = 1;
+ rows[1] = 1;
+ values[1] = 0.80483;
+
+ cols[2] = 2;
+ rows[2] = 1;
+ values[2] = 0.14120;
+ rows[3] = 2;
+ values[3] = 0.3;
+
+ cols[3] = 4;
+ rows[4] = 0;
+ values[4] = 0.77696;
+ rows[5] = 1;
+ values[5] = 0.41860;
+ rows[6] = 3;
+ values[6] = 0.88979;
+
+ cols[4] = 7;
+ }
+
+ vector<int> cols;
+ vector<int> rows;
+ vector<double> values;
+};
+
+TEST_F(SolveUpperTriangularTest, SolveInPlace) {
+ double rhs_and_solution[] = {1.0, 1.0, 2.0, 2.0};
+ const double expected[] = { -1.4706, -1.0962, 6.6667, 2.2477};
+
+ SolveUpperTriangularInPlace<int>(cols.size() - 1,
+ &rows[0],
+ &cols[0],
+ &values[0],
+ rhs_and_solution);
+
+ for (int i = 0; i < 4; ++i) {
+ EXPECT_NEAR(rhs_and_solution[i], expected[i], 1e-4) << i;
+ }
+}
+
+TEST_F(SolveUpperTriangularTest, TransposeSolveInPlace) {
+ double rhs_and_solution[] = {1.0, 1.0, 2.0, 2.0};
+ double expected[] = {1.970288, 1.242498, 6.081864, -0.057255};
+
+ SolveUpperTriangularTransposeInPlace<int>(cols.size() - 1,
+ &rows[0],
+ &cols[0],
+ &values[0],
+ rhs_and_solution);
+
+ for (int i = 0; i < 4; ++i) {
+ EXPECT_NEAR(rhs_and_solution[i], expected[i], 1e-4) << i;
+ }
+}
+
+TEST_F(SolveUpperTriangularTest, RTRSolveWithSparseRHS) {
+ double solution[4];
+ double expected[] = { 6.8420e+00, 1.0057e+00, -1.4907e-16, -1.9335e+00,
+ 1.0057e+00, 2.2275e+00, -1.9493e+00, -6.5693e-01,
+ -1.4907e-16, -1.9493e+00, 1.1111e+01, 9.7381e-17,
+ -1.9335e+00, -6.5693e-01, 9.7381e-17, 1.2631e+00 };
+
+ for (int i = 0; i < 4; ++i) {
+ SolveRTRWithSparseRHS<int>(cols.size() - 1,
+ &rows[0],
+ &cols[0],
+ &values[0],
+ i,
+ solution);
+ for (int j = 0; j < 4; ++j) {
+ EXPECT_NEAR(solution[j], expected[4 * i + j], 1e-3) << i;
+ }
+ }
+}
+
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index 61feb6b..19d545c 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -38,6 +38,7 @@
#include <utility>
#include <vector>
#include "Eigen/SVD"
+#include "ceres/compressed_col_sparse_matrix_utils.h"
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/covariance.h"
#include "ceres/crs_matrix.h"
@@ -55,6 +56,7 @@
// Per thread storage for SuiteSparse.
#ifndef CERES_NO_SUITESPARSE
+
struct PerThreadContext {
explicit PerThreadContext(int num_rows)
: solution(NULL),
@@ -80,6 +82,7 @@
cholmod_dense* rhs;
SuiteSparse ss;
};
+
#endif
} // namespace
@@ -605,7 +608,6 @@
vector<SuiteSparse_long> transpose_rows(num_cols + 1, 0);
vector<SuiteSparse_long> transpose_cols(num_nonzeros, 0);
-
vector<double> transpose_values(num_nonzeros, 0);
for (int idx = 0; idx < num_nonzeros; ++idx) {
@@ -650,23 +652,49 @@
cholmod_common cc;
cholmod_l_start(&cc);
- SuiteSparseQR_factorization<double>* factor =
- SuiteSparseQR_factorize<double>(SPQR_ORDERING_BESTAMD,
- SPQR_DEFAULT_TOL,
- &cholmod_jacobian,
- &cc);
- event_logger.AddEvent("Numeric Factorization");
+ cholmod_sparse* R = NULL;
+ SuiteSparse_long* permutation = NULL;
- const int rank = cc.SPQR_istat[4];
+ // Compute a Q-less QR factorization of the Jacobian. Since we are
+ // only interested in inverting J'J = R'R, we do not need Q. This
+ // saves memory and gives us R as a permuted compressed column
+ // sparse matrix.
+ //
+ // TODO(sameeragarwal): Currently the symbolic factorization and the
+ // numeric factorization is done at the same time, and this does not
+ // explicitly account for the block column and row structure in the
+ // matrix. When using AMD, we have observed in the past that
+ // computing the ordering with the block matrix is significantly
+ // more efficient, both in runtime as well as the quality of
+ // ordering computed. So, it maybe worth doing that analysis
+ // separately.
+ const SuiteSparse_long rank =
+ SuiteSparseQR<double>(SPQR_ORDERING_BESTAMD,
+ SPQR_DEFAULT_TOL,
+ cholmod_jacobian.ncol,
+ &cholmod_jacobian,
+ &R,
+ &permutation,
+ &cc);
+ event_logger.AddEvent("Numeric Factorization");
+ CHECK_NOTNULL(permutation);
+ CHECK_NOTNULL(R);
+
if (rank < cholmod_jacobian.ncol) {
LOG(WARNING) << "Jacobian matrix is rank deficient."
<< "Number of columns: " << cholmod_jacobian.ncol
<< " rank: " << rank;
- SuiteSparseQR_free(&factor, &cc);
+ delete []permutation;
+ cholmod_l_free_sparse(&R, &cc);
cholmod_l_finish(&cc);
return false;
}
+ vector<int> inverse_permutation(num_cols);
+ for (SuiteSparse_long i = 0; i < num_cols; ++i) {
+ inverse_permutation[permutation[i]] = i;
+ }
+
const int* rows = covariance_matrix_->rows();
const int* cols = covariance_matrix_->cols();
double* values = covariance_matrix_->mutable_values();
@@ -680,35 +708,39 @@
//
// Since the covariance matrix is symmetric, the i^th row and column
// are equal.
+ const int num_threads = options_.num_threads;
+ scoped_array<double> workspace(new double[num_threads * num_cols]);
- cholmod_dense* rhs = cholmod_l_zeros(num_cols, 1, CHOLMOD_REAL, &cc);
- double* rhs_x = reinterpret_cast<double*>(rhs->x);
-
+#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
for (int r = 0; r < num_cols; ++r) {
- int row_begin = rows[r];
- int row_end = rows[r + 1];
+ const int row_begin = rows[r];
+ const int row_end = rows[r + 1];
if (row_end == row_begin) {
continue;
}
- rhs_x[r] = 1.0;
+# ifdef CERES_USE_OPENMP
+ int thread_id = omp_get_thread_num();
+# else
+ int thread_id = 0;
+# endif
- cholmod_dense* y1 = SuiteSparseQR_solve<double>(SPQR_RTX_EQUALS_ETB, factor, rhs, &cc);
- cholmod_dense* solution = SuiteSparseQR_solve<double>(SPQR_RETX_EQUALS_B, factor, y1, &cc);
-
- double* solution_x = reinterpret_cast<double*>(solution->x);
+ double* solution = workspace.get() + thread_id * num_cols;
+ SolveRTRWithSparseRHS<SuiteSparse_long>(
+ num_cols,
+ static_cast<SuiteSparse_long*>(R->i),
+ static_cast<SuiteSparse_long*>(R->p),
+ static_cast<double*>(R->x),
+ inverse_permutation[r],
+ solution);
for (int idx = row_begin; idx < row_end; ++idx) {
- const int c = cols[idx];
- values[idx] = solution_x[c];
+ const int c = cols[idx];
+ values[idx] = solution[inverse_permutation[c]];
}
-
- cholmod_l_free_dense(&y1, &cc);
- cholmod_l_free_dense(&solution, &cc);
- rhs_x[r] = 0.0;
}
- cholmod_l_free_dense(&rhs, &cc);
- SuiteSparseQR_free(&factor, &cc);
+ delete []permutation;
+ cholmod_l_free_sparse(&R, &cc);
cholmod_l_finish(&cc);
event_logger.AddEvent("Inversion");
return true;
diff --git a/internal/ceres/covariance_test.cc b/internal/ceres/covariance_test.cc
index e7d25a1..f3a5051 100644
--- a/internal/ceres/covariance_test.cc
+++ b/internal/ceres/covariance_test.cc
@@ -499,6 +499,9 @@
#ifndef CERES_NO_SUITESPARSE
options.algorithm_type = SPARSE_CHOLESKY;
ComputeAndCompareCovarianceBlocks(options, expected_covariance);
+
+ options.algorithm_type = SPARSE_QR;
+ ComputeAndCompareCovarianceBlocks(options, expected_covariance);
#endif
options.algorithm_type = DENSE_SVD;
@@ -552,6 +555,9 @@
#ifndef CERES_NO_SUITESPARSE
options.algorithm_type = SPARSE_CHOLESKY;
ComputeAndCompareCovarianceBlocks(options, expected_covariance);
+
+ options.algorithm_type = SPARSE_QR;
+ ComputeAndCompareCovarianceBlocks(options, expected_covariance);
#endif
options.algorithm_type = DENSE_SVD;
@@ -776,6 +782,7 @@
TEST_F(LargeScaleCovarianceTest, Parallel) {
ComputeAndCompare(SPARSE_CHOLESKY, 4);
+ ComputeAndCompare(SPARSE_QR, 4);
}
#endif // !defined(CERES_NO_SUITESPARSE) && defined(CERES_USE_OPENMP)