Multithread covariance estimation.

1. Multithread the inversion of J'J.
2. Simplify the dense rank truncation loop.
3. Minor correction to the build documentation.

Change-Id: Ide932811c0f28dc6c253809339fb2caa083865b5
diff --git a/docs/source/building.rst b/docs/source/building.rst
index 17e17d4..b91027b 100644
--- a/docs/source/building.rst
+++ b/docs/source/building.rst
@@ -72,7 +72,7 @@
 .. code-block:: bash
 
      # CMake
-     sudo apt-hey install cmake
+     sudo apt-get install cmake
      # gflags
      tar -xvzf gflags-2.0.tar.gz
      cd gflags-2.0
diff --git a/include/ceres/covariance.h b/include/ceres/covariance.h
index f65d9eb..b093026 100644
--- a/include/ceres/covariance.h
+++ b/include/ceres/covariance.h
@@ -51,7 +51,7 @@
 // 1. This is experimental code and the API WILL CHANGE before
 //    release.
 //
-// 2. WARNING: It is very easy to use this class incorrectly without
+// 2. It is very easy to use this class incorrectly without
 //    understanding the underlying mathematics. Please read and
 //    understand the documentation completely before attempting to use
 //    this class.
@@ -166,6 +166,21 @@
 // with indeterminacy. IEEE Transactions on Information Theory 47(5):
 // 2017-2028 (2001)
 //
+// Speed
+// -----
+//
+// When use_dense_linear_algebra = true, Eigen's JacobiSVD algorithm
+// is used to perform the computations. It is an accurate but slow
+// method and should only be used for small to moderately sized
+// problems.
+//
+// When use_dense_linear_algebra = false, SuiteSparse/CHOLMOD is used
+// to perform the computation. Recent versions of SuiteSparse (>=
+// 4.2.0) provide a much more efficient method for solving for rows of
+// the covariance matrix. Therefore, if you are doing large-scale
+// covariance estimation, we strongly recommend using a recent version
+// of SuiteSparse.
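+//
+// As a minimal sketch (see "Example Usage" below for the complete
+// API), selecting the sparse path with multiple threads looks like:
+//
+//   Covariance::Options options;
+//   options.use_dense_linear_algebra = false;  // SuiteSparse/CHOLMOD
+//   options.num_threads = 4;
+//   Covariance covariance(options);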
+//
 // Example Usage
 // =============
 //
@@ -248,6 +263,27 @@
     // and by default Covariance::Compute will return false if it
     // encounters such a matrix.
     //
+    // use_dense_linear_algebra = false
+    // --------------------------------
+    //
+    // When performing large-scale sparse covariance estimation,
+    // computing the exact value of the reciprocal condition number is
+    // not possible as it would require computing the eigenvalues of
+    // J'J.
+    //
+    // In this case we use cholmod_rcond, which uses the ratio of the
+    // smallest to the largest diagonal entry of the Cholesky
+    // factorization as an approximation to the reciprocal condition
+    // number.
+    //
+    // However, care must be taken as this is a heuristic and can
+    // sometimes be a very crude estimate. The default value of
+    // min_reciprocal_condition_number has been set to a conservative
+    // value, and sometimes Covariance::Compute may return false
+    // even if it is possible to estimate the covariance reliably. In
+    // such cases, the user should exercise their judgement before
+    // lowering the value of min_reciprocal_condition_number.
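+    //
+    // For example (an illustrative sketch; the threshold shown here
+    // is a hypothetical value, not a recommendation):
+    //
+    //   Covariance::Options options;
+    //   options.use_dense_linear_algebra = false;
+    //   // Relax the check only after independently verifying that
+    //   // J'J is well conditioned.
+    //   options.min_reciprocal_condition_number = 1e-14;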
+    //
     // use_dense_linear_algebra = true
     // -------------------------------
     //
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index 966ba35..978acc1 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -30,6 +30,10 @@
 
 #include "ceres/covariance_impl.h"
 
+#ifdef CERES_USE_OPENMP
+#include <omp.h>
+#endif
+
 #include <algorithm>
 #include <utility>
 #include <vector>
@@ -47,6 +51,36 @@
 
 namespace ceres {
 namespace internal {
+namespace {
+
+// Per-thread storage for SuiteSparse. Each thread gets its own
+// CHOLMOD workspaces and right hand side so that concurrent calls to
+// cholmod_solve2 do not share mutable state.
+struct PerThreadContext {
+  explicit PerThreadContext(int num_rows)
+      : solution(NULL),
+        solution_set(NULL),
+        y_workspace(NULL),
+        e_workspace(NULL),
+        rhs(NULL) {
+    rhs = ss.CreateDenseVector(NULL, num_rows, num_rows);
+  }
+
+  ~PerThreadContext() {
+    ss.Free(solution);
+    ss.Free(solution_set);
+    ss.Free(y_workspace);
+    ss.Free(e_workspace);
+    ss.Free(rhs);
+  }
+
+  cholmod_dense* solution;
+  cholmod_sparse* solution_set;
+  cholmod_dense* y_workspace;
+  cholmod_dense* e_workspace;
+  cholmod_dense* rhs;
+  SuiteSparse ss;
+};
+
+}  // namespace
 
 typedef vector<pair<const double*, const double*> > CovarianceBlocks;
 
@@ -424,9 +458,6 @@
   const int* cols = covariance_matrix_->cols();
   double* values = covariance_matrix_->mutable_values();
 
-  cholmod_dense* rhs = ss_.CreateDenseVector(NULL, num_rows, num_rows);
-  double* rhs_x = reinterpret_cast<double*>(rhs->x);
-
   // The following loop exploits the fact that the i^th column of A^{-1}
   // is given by the solution to the linear system
   //
@@ -441,6 +472,9 @@
   // versions of SuiteSparse have the cholmod_solve2 function which
   // re-uses memory across calls.
 #if (SUITESPARSE_VERSION < 4002)
+  cholmod_dense* rhs = ss_.CreateDenseVector(NULL, num_rows, num_rows);
+  double* rhs_x = reinterpret_cast<double*>(rhs->x);
+
   for (int r = 0; r < num_rows; ++r) {
     int row_begin = rows[r];
     int row_end = rows[r + 1];
@@ -458,15 +492,35 @@
     ss_.Free(solution);
     rhs_x[r] = 0.0;
   }
-#else
-  // TODO(sameeragarwal) There should be a more efficient way
-  // involving the use of Bset but I am unable to make it work right
-  // now.
-  cholmod_dense* solution = NULL;
-  cholmod_sparse* solution_set = NULL;
-  cholmod_dense* y_workspace = NULL;
-  cholmod_dense* e_workspace = NULL;
 
+  ss_.Free(rhs);
+#else  // SUITESPARSE_VERSION < 4002
+
+  const int num_threads = options_.num_threads;
+  vector<PerThreadContext*> contexts(num_threads);
+  for (int i = 0; i < num_threads; ++i) {
+    contexts[i] = new PerThreadContext(num_rows);
+  }
+
+  // The first call to cholmod_solve2 is not thread safe, since it
+  // changes the factorization from supernodal to simplicial etc.
+  {
+    PerThreadContext* context = contexts[0];
+    double* context_rhs_x = reinterpret_cast<double*>(context->rhs->x);
+    context_rhs_x[0] = 1.0;
+    cholmod_solve2(CHOLMOD_A,
+                   factor,
+                   context->rhs,
+                   NULL,
+                   &context->solution,
+                   &context->solution_set,
+                   &context->y_workspace,
+                   &context->e_workspace,
+                   context->ss.mutable_cc());
+    context_rhs_x[0] = 0.0;
+  }
+
+#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
   for (int r = 0; r < num_rows; ++r) {
     int row_begin = rows[r];
     int row_end = rows[r + 1];
@@ -474,40 +528,52 @@
       continue;
     }
 
-    rhs_x[r] = 1.0;
+#  ifdef CERES_USE_OPENMP
+    int thread_id = omp_get_thread_num();
+#  else
+    int thread_id = 0;
+#  endif
 
+    PerThreadContext* context = contexts[thread_id];
+    double* context_rhs_x = reinterpret_cast<double*>(context->rhs->x);
+    context_rhs_x[r] = 1.0;
+
+    // TODO(sameeragarwal) There should be a more efficient way
+    // involving the use of Bset but I am unable to make it work right
+    // now.
     cholmod_solve2(CHOLMOD_A,
                    factor,
-                   rhs,
+                   context->rhs,
                    NULL,
-                   &solution,
-                   &solution_set,
-                   &y_workspace,
-                   &e_workspace,
-                   ss_.mutable_cc());
+                   &context->solution,
+                   &context->solution_set,
+                   &context->y_workspace,
+                   &context->e_workspace,
+                   context->ss.mutable_cc());
 
-    double* solution_x = reinterpret_cast<double*>(solution->x);
+    double* solution_x = reinterpret_cast<double*>(context->solution->x);
     for (int idx = row_begin; idx < row_end; ++idx) {
       const int c = cols[idx];
       values[idx] = solution_x[c];
     }
-    rhs_x[r] = 0.0;
+    context_rhs_x[r] = 0.0;
   }
 
-  ss_.Free(solution);
-  ss_.Free(solution_set);
-  ss_.Free(y_workspace);
-  ss_.Free(e_workspace);
+  for (int i = 0; i < num_threads; ++i) {
+    delete contexts[i];
+  }
 
-#endif
+#endif  // SUITESPARSE_VERSION < 4002
 
-  ss_.Free(rhs);
   ss_.Free(factor);
   event_logger.AddEvent("Inversion");
   return true;
-#else
+
+#else  // CERES_NO_SUITESPARSE
+
   return false;
-#endif
+
+#endif  // CERES_NO_SUITESPARSE
 };
 
 bool CovarianceImpl::ComputeCovarianceValuesUsingEigen() {
@@ -549,19 +615,32 @@
   const int max_rank = min(num_singular_values,
                            num_singular_values - options_.null_space_rank);
 
+  // Compute the squared inverse of the singular values. Truncate the
+  // computation based on min_singular_value_ratio and
+  // null_space_rank. When either of these two quantities are active,
+  // the resulting covariance matrix is a Moore-Penrose inverse
+  // instead of a regular inverse.
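+  //
+  // Concretely, if J = U * D * V' is the SVD of J, then
+  // J'J = V * D^2 * V', and the (pseudo) inverse assembled below is
+  //
+  //   inv(J'J) = V * diag(1 / sigma_i^2) * V',
+  //
+  // where 1 / sigma_i^2 is replaced by zero for every truncated
+  // singular value sigma_i.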
   for (int i = 0; i < max_rank; ++i) {
     const double singular_value_ratio = singular_values[i] / max_singular_value;
-    if (singular_value_ratio >= min_singular_value_ratio) {
-      inverse_squared_singular_values[i] =
-          1.0 / (singular_values[i] * singular_values[i]);
-    } else if (!automatic_truncation) {
-      LOG(WARNING) << "Cholesky factorization of J'J is not reliable. "
-                   << "Reciprocal condition number: "
-                   << singular_value_ratio * singular_value_ratio << " "
-                   << "min_reciprocal_condition_number : "
-                   << options_.min_reciprocal_condition_number;
-      return false;
+    if (singular_value_ratio < min_singular_value_ratio) {
+      // Since the singular values are in decreasing order, if
+      // automatic truncation is enabled, then every remaining value
+      // will also fail the ratio test and the loop can stop here.
+      if (automatic_truncation) {
+        break;
+      } else {
+        LOG(WARNING) << "Cholesky factorization of J'J is not reliable. "
+                     << "Reciprocal condition number: "
+                     << singular_value_ratio * singular_value_ratio << " "
+                     << "min_reciprocal_condition_number : "
+                     << options_.min_reciprocal_condition_number;
+        return false;
+      }
     }
+
+    inverse_squared_singular_values[i] =
+        1.0 / (singular_values[i] * singular_values[i]);
   }
 
   Matrix dense_covariance =
@@ -585,7 +664,5 @@
   return true;
 };
 
-
-
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/covariance_test.cc b/internal/ceres/covariance_test.cc
index 05fb234..bd0a0e2 100644
--- a/internal/ceres/covariance_test.cc
+++ b/internal/ceres/covariance_test.cc
@@ -405,6 +405,50 @@
   ComputeAndCompareCovarianceBlocks(options, expected_covariance);
 }
 
+#ifdef CERES_USE_OPENMP
+
+TEST_F(CovarianceTest, ThreadedNormalBehavior) {
+  // J
+  //
+  //   1  0  0  0  0  0
+  //   0  1  0  0  0  0
+  //   0  0  2  0  0  0
+  //   0  0  0  2  0  0
+  //   0  0  0  0  2  0
+  //   0  0  0  0  0  5
+  //  -5 -6  1  2  3  0
+  //   3 -2  0  0  0  2
+
+  // J'J
+  //
+  //   35  24 -5 -10 -15  6
+  //   24  41 -6 -12 -18 -4
+  //   -5  -6  5   2   3  0
+  //  -10 -12  2   8   6  0
+  //  -15 -18  3   6  13  0
+  //    6  -4  0   0   0 29
+
+  // inv(J'J) computed using octave.
+  double expected_covariance[] = {
+     7.0747e-02,  -8.4923e-03,   1.6821e-02,   3.3643e-02,   5.0464e-02,  -1.5809e-02,  // NOLINT
+    -8.4923e-03,   8.1352e-02,   2.4758e-02,   4.9517e-02,   7.4275e-02,   1.2978e-02,  // NOLINT
+     1.6821e-02,   2.4758e-02,   2.4904e-01,  -1.9271e-03,  -2.8906e-03,  -6.5325e-05,  // NOLINT
+     3.3643e-02,   4.9517e-02,  -1.9271e-03,   2.4615e-01,  -5.7813e-03,  -1.3065e-04,  // NOLINT
+     5.0464e-02,   7.4275e-02,  -2.8906e-03,  -5.7813e-03,   2.4133e-01,  -1.9598e-04,  // NOLINT
+    -1.5809e-02,   1.2978e-02,  -6.5325e-05,  -1.3065e-04,  -1.9598e-04,   3.9544e-02,  // NOLINT
+  };
+
+  Covariance::Options options;
+  options.num_threads = 4;
+
+  options.use_dense_linear_algebra = false;
+  ComputeAndCompareCovarianceBlocks(options, expected_covariance);
+
+  options.use_dense_linear_algebra = true;
+  ComputeAndCompareCovarianceBlocks(options, expected_covariance);
+}
+
+#endif  // CERES_USE_OPENMP
 
 TEST_F(CovarianceTest, ConstantParameterBlock) {
   problem_.SetParameterBlockConstant(parameters_);
@@ -644,5 +688,80 @@
   ComputeAndCompareCovarianceBlocks(options, expected_covariance);
 }
 
+class LargeScaleCovarianceTest : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    num_parameter_blocks_ = 2000;
+    parameter_block_size_ = 5;
+    parameters_.reset(
+        new double[parameter_block_size_ * num_parameter_blocks_]);
+
+    Matrix jacobian(parameter_block_size_, parameter_block_size_);
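+    // Each parameter block gets its own residual block whose Jacobian
+    // is (i + 1) * I. The blocks are independent, so the exact
+    // covariance of block i is I / (i + 1)^2 and every cross
+    // covariance block is zero.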
+    for (int i = 0; i < num_parameter_blocks_; ++i) {
+      jacobian.setIdentity();
+      jacobian *= (i + 1);
+
+      double* block_i = parameters_.get() + i * parameter_block_size_;
+      problem_.AddResidualBlock(new UnaryCostFunction(parameter_block_size_,
+                                                      parameter_block_size_,
+                                                      jacobian.data()),
+                                NULL,
+                                block_i);
+      for (int j = i; j < num_parameter_blocks_; ++j) {
+        double* block_j = parameters_.get() + j * parameter_block_size_;
+        all_covariance_blocks_.push_back(make_pair(block_i, block_j));
+      }
+    }
+  }
+
+  void ComputeAndCompare(int num_threads) {
+    Covariance::Options options;
+    options.use_dense_linear_algebra = false;
+    options.num_threads = num_threads;
+    Covariance covariance(options);
+    EXPECT_TRUE(covariance.Compute(all_covariance_blocks_, &problem_));
+
+    Matrix expected(parameter_block_size_, parameter_block_size_);
+    Matrix actual(parameter_block_size_, parameter_block_size_);
+    const double kTolerance = 1e-16;
+
+    for (int i = 0; i < num_parameter_blocks_; ++i) {
+      expected.setIdentity();
+      expected /= (i + 1.0) * (i + 1.0);
+
+      double* block_i = parameters_.get() + i * parameter_block_size_;
+      covariance.GetCovarianceBlock(block_i, block_i, actual.data());
+      EXPECT_NEAR((expected - actual).norm(), 0.0, kTolerance)
+          << "block: " << i << ", " << i << "\n"
+          << "expected: \n" << expected << "\n"
+          << "actual: \n" << actual;
+
+      expected.setZero();
+      for (int j = i + 1; j < num_parameter_blocks_; ++j) {
+        double* block_j = parameters_.get() + j * parameter_block_size_;
+        covariance.GetCovarianceBlock(block_i, block_j, actual.data());
+        EXPECT_NEAR((expected - actual).norm(), 0.0, kTolerance)
+            << "block: " << i << ", " << j << "\n"
+            << "expected: \n" << expected << "\n"
+            << "actual: \n" << actual;
+      }
+    }
+  }
+
+  scoped_array<double> parameters_;
+  int parameter_block_size_;
+  int num_parameter_blocks_;
+
+  Problem problem_;
+  vector<pair<const double*, const double*> > all_covariance_blocks_;
+};
+
+#if !defined(CERES_NO_SUITESPARSE) && defined(CERES_USE_OPENMP)
+
+TEST_F(LargeScaleCovarianceTest, Parallel) {
+  ComputeAndCompare(4);
+}
+
+#endif  // !defined(CERES_NO_SUITESPARSE) && defined(CERES_USE_OPENMP)
+
 }  // namespace internal
 }  // namespace ceres