Reduce memory usage in covariance estimation.

When using the SPARSE_QR algorithm, a Q-less QR factorization is
now used. This results in significantly lower memory usage.

The inversion of the semi-normal equations is now threaded using
OpenMP. If SuiteSparse is compiled with TBB, then both the
factorization and the inversion are fully threaded.
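
In outline (a sketch of the approach, in the notation of the code
below): SuiteSparseQR computes the factorization J * P = Q * R,
so that

  P' * J' * J * P = R' * R

The r^th column of the covariance (J' * J)^-1 is then P * x,
where x solves the semi-normal equations

  R' * R * x = P' * e_r

which takes one forward and one backward triangular solve with R
and never needs Q.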

Change-Id: Ia07591e48e7958d427ef91ff9e67662f6e982c21
diff --git a/internal/ceres/compressed_col_sparse_matrix_utils.h b/internal/ceres/compressed_col_sparse_matrix_utils.h
index afabf1c..494a8a5 100644
--- a/internal/ceres/compressed_col_sparse_matrix_utils.h
+++ b/internal/ceres/compressed_col_sparse_matrix_utils.h
@@ -61,6 +61,81 @@
                                    const vector<int>& block_ordering,
                                    vector<int>* scalar_ordering);
 
+// Solve the linear system
+//
+//   R * solution = rhs
+//
+// where R is an upper triangular compressed column sparse matrix, with
+// the diagonal entry of each column stored as the last entry of that
+// column. The solution is computed in place, overwriting
+// rhs_and_solution.
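+//
+// A small illustrative example (values chosen purely for exposition):
+// the matrix
+//
+//   R = [ 2  1 ]
+//       [ 0  3 ]
+//
+// is stored as cols = {0, 1, 3}, rows = {0, 0, 1} and
+// values = {2.0, 1.0, 3.0}; solving R * x = [2, 3]' yields
+// x = [0.5, 1.0]'.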
+template <typename IntegerType>
+void SolveUpperTriangularInPlace(IntegerType num_cols,
+                                 const IntegerType* rows,
+                                 const IntegerType* cols,
+                                 const double* values,
+                                 double* rhs_and_solution) {
+  for (IntegerType c = num_cols - 1; c >= 0; --c) {
+    rhs_and_solution[c] /= values[cols[c + 1] - 1];
+    for (IntegerType idx = cols[c]; idx < cols[c + 1] - 1; ++idx) {
+      const IntegerType r = rows[idx];
+      const double v = values[idx];
+      rhs_and_solution[r] -= v * rhs_and_solution[c];
+    }
+  }
+}
+
+// Solve the linear system
+//
+//   R' * solution = rhs
+//
+// where R is an upper triangular compressed column sparse matrix.
+template <typename IntegerType>
+void SolveUpperTriangularTransposeInPlace(IntegerType num_cols,
+                                          const IntegerType* rows,
+                                          const IntegerType* cols,
+                                          const double* values,
+                                          double* rhs_and_solution) {
+  for (IntegerType c = 0; c < num_cols; ++c) {
+    for (IntegerType idx = cols[c]; idx < cols[c + 1] - 1; ++idx) {
+      const IntegerType r = rows[idx];
+      const double v = values[idx];
+      rhs_and_solution[c] -= v * rhs_and_solution[r];
+    }
+    rhs_and_solution[c] /= values[cols[c + 1] - 1];
+  }
+}
+
+// Given an upper triangular matrix R in compressed column form, solve
+// the linear system,
+//
+//  R'R x = b
+//
+// where b is all zeros except for entry rhs_nonzero_index, which is
+// equal to one.
+//
+// The function exploits this knowledge to reduce the number of
+// floating point operations.
+template <typename IntegerType>
+void SolveRTRWithSparseRHS(IntegerType num_cols,
+                           const IntegerType* rows,
+                           const IntegerType* cols,
+                           const double* values,
+                           const int rhs_nonzero_index,
+                           double* solution) {
+  fill(solution, solution + num_cols, 0.0);
+  solution[rhs_nonzero_index] = 1.0 / values[cols[rhs_nonzero_index + 1] - 1];
+
+  for (IntegerType c = rhs_nonzero_index + 1; c < num_cols; ++c) {
+    for (IntegerType idx = cols[c]; idx < cols[c + 1] - 1; ++idx) {
+      const IntegerType r = rows[idx];
+      if (r < rhs_nonzero_index) continue;
+      const double v = values[idx];
+      solution[c] -= v * solution[r];
+    }
+    solution[c] /= values[cols[c + 1] - 1];
+  }
+  SolveUpperTriangularInPlace(num_cols, rows, cols, values, solution);
+}
+
 }  // namespace internal
 }  // namespace ceres
 
diff --git a/internal/ceres/compressed_col_sparse_matrix_utils_test.cc b/internal/ceres/compressed_col_sparse_matrix_utils_test.cc
index e810837..3faf06c 100644
--- a/internal/ceres/compressed_col_sparse_matrix_utils_test.cc
+++ b/internal/ceres/compressed_col_sparse_matrix_utils_test.cc
@@ -193,5 +193,92 @@
   ss.Free(ccsm.release());
 }
 
+class SolveUpperTriangularTest : public ::testing::Test {
+ protected:
+  void SetUp() {
+    cols.resize(5);
+    rows.resize(7);
+    values.resize(7);
+
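+    // The fixture builds the upper triangular matrix
+    //
+    //         [ 0.50754  0        0        0.77696 ]
+    //     R = [ 0        0.80483  0.14120  0.41860 ]
+    //         [ 0        0        0.3      0       ]
+    //         [ 0        0        0        0.88979 ]
+    //
+    // in compressed column form.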
+    cols[0] = 0;
+    rows[0] = 0;
+    values[0] = 0.50754;
+
+    cols[1] = 1;
+    rows[1] = 1;
+    values[1] = 0.80483;
+
+    cols[2] = 2;
+    rows[2] = 1;
+    values[2] = 0.14120;
+    rows[3] = 2;
+    values[3] = 0.3;
+
+    cols[3] = 4;
+    rows[4] = 0;
+    values[4] = 0.77696;
+    rows[5] = 1;
+    values[5] = 0.41860;
+    rows[6] = 3;
+    values[6] = 0.88979;
+
+    cols[4] = 7;
+  }
+
+  vector<int> cols;
+  vector<int> rows;
+  vector<double> values;
+};
+
+TEST_F(SolveUpperTriangularTest, SolveInPlace) {
+  double rhs_and_solution[] = {1.0, 1.0, 2.0, 2.0};
+  const double expected[] = { -1.4706, -1.0962, 6.6667, 2.2477};
+
+  SolveUpperTriangularInPlace<int>(cols.size() - 1,
+                                   &rows[0],
+                                   &cols[0],
+                                   &values[0],
+                                   rhs_and_solution);
+
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_NEAR(rhs_and_solution[i], expected[i], 1e-4) << i;
+  }
+}
+
+TEST_F(SolveUpperTriangularTest, TransposeSolveInPlace) {
+  double rhs_and_solution[] = {1.0, 1.0, 2.0, 2.0};
+  double expected[] = {1.970288,  1.242498,  6.081864, -0.057255};
+
+  SolveUpperTriangularTransposeInPlace<int>(cols.size() - 1,
+                                            &rows[0],
+                                            &cols[0],
+                                            &values[0],
+                                            rhs_and_solution);
+
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_NEAR(rhs_and_solution[i], expected[i], 1e-4) << i;
+  }
+}
+
+TEST_F(SolveUpperTriangularTest, RTRSolveWithSparseRHS) {
+  double solution[4];
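+  // Row i of expected holds the solution of R'R x = e_i, i.e. the i^th
+  // column of (R'R)^{-1}.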
+  double expected[] = { 6.8420e+00,   1.0057e+00,  -1.4907e-16,  -1.9335e+00,
+                        1.0057e+00,   2.2275e+00,  -1.9493e+00,  -6.5693e-01,
+                       -1.4907e-16,  -1.9493e+00,   1.1111e+01,   9.7381e-17,
+                       -1.9335e+00,  -6.5693e-01,   9.7381e-17,   1.2631e+00 };
+
+  for (int i = 0; i < 4; ++i) {
+    SolveRTRWithSparseRHS<int>(cols.size() - 1,
+                               &rows[0],
+                               &cols[0],
+                               &values[0],
+                               i,
+                               solution);
+    for (int j = 0; j < 4; ++j) {
+      EXPECT_NEAR(solution[j], expected[4 * i + j], 1e-3) << i;
+    }
+  }
+}
+
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index 61feb6b..19d545c 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -38,6 +38,7 @@
 #include <utility>
 #include <vector>
 #include "Eigen/SVD"
+#include "ceres/compressed_col_sparse_matrix_utils.h"
 #include "ceres/compressed_row_sparse_matrix.h"
 #include "ceres/covariance.h"
 #include "ceres/crs_matrix.h"
@@ -55,6 +56,7 @@
 
 // Per thread storage for SuiteSparse.
 #ifndef CERES_NO_SUITESPARSE
+
 struct PerThreadContext {
   explicit PerThreadContext(int num_rows)
       : solution(NULL),
@@ -80,6 +82,7 @@
   cholmod_dense* rhs;
   SuiteSparse ss;
 };
+
 #endif
 
 }  // namespace
@@ -605,7 +608,6 @@
 
   vector<SuiteSparse_long> transpose_rows(num_cols + 1, 0);
   vector<SuiteSparse_long> transpose_cols(num_nonzeros, 0);
-
   vector<double> transpose_values(num_nonzeros, 0);
 
   for (int idx = 0; idx < num_nonzeros; ++idx) {
@@ -650,23 +652,49 @@
   cholmod_common cc;
   cholmod_l_start(&cc);
 
-  SuiteSparseQR_factorization<double>* factor =
-      SuiteSparseQR_factorize<double>(SPQR_ORDERING_BESTAMD,
-                                      SPQR_DEFAULT_TOL,
-                                      &cholmod_jacobian,
-                                      &cc);
-  event_logger.AddEvent("Numeric Factorization");
+  cholmod_sparse* R = NULL;
+  SuiteSparse_long* permutation = NULL;
 
-  const int rank = cc.SPQR_istat[4];
+  // Compute a Q-less QR factorization of the Jacobian. Since we are
+  // only interested in inverting J'J = R'R, we do not need Q. This
+  // saves memory and gives us R as a permuted compressed column
+  // sparse matrix.
+  //
+  // TODO(sameeragarwal): Currently the symbolic factorization and
+  // the numeric factorization are done at the same time, and this
+  // does not explicitly account for the block column and row
+  // structure in the matrix. When using AMD, we have observed in
+  // the past that computing the ordering with the block matrix is
+  // significantly more efficient, both in runtime and in the
+  // quality of the ordering computed. So it may be worth doing
+  // that analysis separately.
+  const SuiteSparse_long rank =
+      SuiteSparseQR<double>(SPQR_ORDERING_BESTAMD,
+                            SPQR_DEFAULT_TOL,
+                            cholmod_jacobian.ncol,
+                            &cholmod_jacobian,
+                            &R,
+                            &permutation,
+                            &cc);
+  event_logger.AddEvent("Numeric Factorization");
+  CHECK_NOTNULL(permutation);
+  CHECK_NOTNULL(R);
+
   if (rank < cholmod_jacobian.ncol) {
     LOG(WARNING) << "Jacobian matrix is rank deficient."
                  << "Number of columns: " << cholmod_jacobian.ncol
                  << " rank: " << rank;
-    SuiteSparseQR_free(&factor, &cc);
+    // permutation is malloc'd inside SuiteSparseQR; release it with free.
+    free(permutation);
+    cholmod_l_free_sparse(&R, &cc);
     cholmod_l_finish(&cc);
     return false;
   }
 
+  vector<int> inverse_permutation(num_cols);
+  for (SuiteSparse_long i = 0; i < num_cols; ++i) {
+    inverse_permutation[permutation[i]] = i;
+  }
+
   const int* rows = covariance_matrix_->rows();
   const int* cols = covariance_matrix_->cols();
   double* values = covariance_matrix_->mutable_values();
@@ -680,35 +708,39 @@
   //
   // Since the covariance matrix is symmetric, the i^th row and column
   // are equal.
+  const int num_threads = options_.num_threads;
+  scoped_array<double> workspace(new double[num_threads * num_cols]);
 
-  cholmod_dense* rhs = cholmod_l_zeros(num_cols, 1, CHOLMOD_REAL, &cc);
-  double* rhs_x = reinterpret_cast<double*>(rhs->x);
-
+#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
   for (int r = 0; r < num_cols; ++r) {
-    int row_begin = rows[r];
-    int row_end = rows[r + 1];
+    const int row_begin = rows[r];
+    const int row_end = rows[r + 1];
     if (row_end == row_begin) {
       continue;
     }
 
-    rhs_x[r] = 1.0;
+#  ifdef CERES_USE_OPENMP
+    int thread_id = omp_get_thread_num();
+#  else
+    int thread_id = 0;
+#  endif
 
-    cholmod_dense* y1 = SuiteSparseQR_solve<double>(SPQR_RTX_EQUALS_ETB, factor, rhs, &cc);
-    cholmod_dense* solution = SuiteSparseQR_solve<double>(SPQR_RETX_EQUALS_B, factor, y1, &cc);
-
-    double* solution_x = reinterpret_cast<double*>(solution->x);
+    double* solution = workspace.get() + thread_id * num_cols;
+    SolveRTRWithSparseRHS<SuiteSparse_long>(
+        num_cols,
+        static_cast<SuiteSparse_long*>(R->i),
+        static_cast<SuiteSparse_long*>(R->p),
+        static_cast<double*>(R->x),
+        inverse_permutation[r],
+        solution);
     for (int idx = row_begin; idx < row_end; ++idx) {
-      const int c = cols[idx];
-      values[idx] = solution_x[c];
+      const int c = cols[idx];
+      values[idx] = solution[inverse_permutation[c]];
     }
-
-    cholmod_l_free_dense(&y1, &cc);
-    cholmod_l_free_dense(&solution, &cc);
-    rhs_x[r] = 0.0;
   }
 
-  cholmod_l_free_dense(&rhs, &cc);
-  SuiteSparseQR_free(&factor, &cc);
+  free(permutation);
+  cholmod_l_free_sparse(&R, &cc);
   cholmod_l_finish(&cc);
   event_logger.AddEvent("Inversion");
   return true;
diff --git a/internal/ceres/covariance_test.cc b/internal/ceres/covariance_test.cc
index e7d25a1..f3a5051 100644
--- a/internal/ceres/covariance_test.cc
+++ b/internal/ceres/covariance_test.cc
@@ -499,6 +499,9 @@
 #ifndef CERES_NO_SUITESPARSE
   options.algorithm_type = SPARSE_CHOLESKY;
   ComputeAndCompareCovarianceBlocks(options, expected_covariance);
+
+  options.algorithm_type = SPARSE_QR;
+  ComputeAndCompareCovarianceBlocks(options, expected_covariance);
 #endif
 
   options.algorithm_type = DENSE_SVD;
@@ -552,6 +555,9 @@
 #ifndef CERES_NO_SUITESPARSE
   options.algorithm_type = SPARSE_CHOLESKY;
   ComputeAndCompareCovarianceBlocks(options, expected_covariance);
+
+  options.algorithm_type = SPARSE_QR;
+  ComputeAndCompareCovarianceBlocks(options, expected_covariance);
 #endif
 
   options.algorithm_type = DENSE_SVD;
@@ -776,6 +782,7 @@
 
 TEST_F(LargeScaleCovarianceTest, Parallel) {
   ComputeAndCompare(SPARSE_CHOLESKY, 4);
+  ComputeAndCompare(SPARSE_QR, 4);
 }
 
 #endif  // !defined(CERES_NO_SUITESPARSE) && defined(CERES_USE_OPENMP)