Let ITERATIVE_SCHUR use an explicit Schur Complement matrix.

Up till now ITERATIVE_SCHUR evaluates matrix-vector products
between the Schur complement and a vector implicitly by exploiting
the algebraic expression for the Schur complement.

This cost of this evaluation scales with the number of non-zeros
in the Jacobian.

For small to medium sized problems there is a sweet spot where
computing the Schur complement is cheap enough that it is much
more efficient to explicitly compute it and use it for evaluating
the matrix-vector products.

This changes implements support for an explicit Schur complement
in ITERATIVE_SCHUR in combination with the SCHUR_JACOBI preconditioner.

API wise a new bool Solver::Options::use_explicit_schur_complement
has been added.

The implementation extends the SparseSchurComplementSolver to use
Conjugate Gradients.

Example speedup:

use_explicit_schur_complement = false

Time (in seconds):
Preprocessor                            0.585

  Residual evaluation                   0.319
  Jacobian evaluation                   1.590
  Linear solver                        25.685
Minimizer                              27.990

Postprocessor                           0.010
Total                                  28.585

use_explicit_schur_complement = true

Time (in seconds):
Preprocessor                            0.638

  Residual evaluation                   0.318
  Jacobian evaluation                   1.507
  Linear solver                         5.930
Minimizer                               8.144

Postprocessor                           0.010
Total                                   8.791

Which indicates an end-to-end speedup of more than 3x, with the linear
solver being sped up by > 4x.

The idea to explore this optimization was inspired by the recent paper:

Mining structure fragments for smart bundle adjustment
L. Carlone, P. Alcantarilla, H. Chiu, K. Zsolt, F. Dellaert
British Machine Vision Conference, 2014

which uses a more complicated algorithm to compute parts of the
Schur complement to speed up the matrix-vector product.

Change-Id: I95324af0ab351faa1600f5204039a1d2a64ae61d
diff --git a/examples/bundle_adjuster.cc b/examples/bundle_adjuster.cc
index 296611f..7787d93 100644
--- a/examples/bundle_adjuster.cc
+++ b/examples/bundle_adjuster.cc
@@ -79,6 +79,8 @@
 DEFINE_string(linear_solver, "sparse_schur", "Options are: "
               "sparse_schur, dense_schur, iterative_schur, sparse_normal_cholesky, "
               "dense_qr, dense_normal_cholesky and cgnr.");
+DEFINE_bool(explicit_schur_complement, false, "If using ITERATIVE_SCHUR "
+            "then explicitly compute the Schur complement.");
 DEFINE_string(preconditioner, "jacobi", "Options are: "
               "identity, jacobi, schur_jacobi, cluster_jacobi, "
               "cluster_tridiagonal.");
@@ -136,6 +138,7 @@
             FLAGS_dense_linear_algebra_library,
             &options->dense_linear_algebra_library_type));
   options->num_linear_solver_threads = FLAGS_num_threads;
+  options->use_explicit_schur_complement = FLAGS_explicit_schur_complement;
 }
 
 void SetOrdering(BALProblem* bal_problem, Solver::Options* options) {
diff --git a/include/ceres/solver.h b/include/ceres/solver.h
index 0af34ca..a5efa2a 100644
--- a/include/ceres/solver.h
+++ b/include/ceres/solver.h
@@ -118,6 +118,7 @@
 #endif
 
       num_linear_solver_threads = 1;
+      use_explicit_schur_complement = false;
       use_postordering = false;
       dynamic_sparsity = false;
       min_linear_solver_iterations = 0;
@@ -496,6 +497,29 @@
     // smaller than the group containing cameras.
     shared_ptr<ParameterBlockOrdering> linear_solver_ordering;
 
+    // Use an explicitly computed Schur complement matrix with
+    // ITERATIVE_SCHUR.
+    //
+    // By default this option is disabled and ITERATIVE_SCHUR
+    // evaluates evaluates matrix-vector products between the Schur
+    // complement and a vector implicitly by exploiting the algebraic
+    // expression for the Schur complement.
+    //
+    // The cost of this evaluation scales with the number of non-zeros
+    // in the Jacobian.
+    //
+    // For small to medium sized problems there is a sweet spot where
+    // computing the Schur complement is cheap enough that it is much
+    // more efficient to explicitly compute it and use it for evaluating
+    // the matrix-vector products.
+    //
+    // Enabling this option tells ITERATIVE_SCHUR to use an explicitly
+    // computed Schur complement.
+    //
+    // NOTE: This option can only be used with the SCHUR_JACOBI
+    // preconditioner.
+    bool use_explicit_schur_complement;
+
     // Sparse Cholesky factorization algorithms use a fill-reducing
     // ordering to permute the columns of the Jacobian matrix. There
     // are two ways of doing this.
diff --git a/internal/ceres/linear_solver.cc b/internal/ceres/linear_solver.cc
index d905ec2..e479b75 100644
--- a/internal/ceres/linear_solver.cc
+++ b/internal/ceres/linear_solver.cc
@@ -96,7 +96,11 @@
       return new DenseSchurComplementSolver(options);
 
     case ITERATIVE_SCHUR:
-      return new IterativeSchurComplementSolver(options);
+      if (options.use_explicit_schur_complement) {
+        return new SparseSchurComplementSolver(options);
+      } else {
+        return new IterativeSchurComplementSolver(options);
+      }
 
     case DENSE_QR:
       return new DenseQRSolver(options);
diff --git a/internal/ceres/linear_solver.h b/internal/ceres/linear_solver.h
index 58b9044..5f59765 100644
--- a/internal/ceres/linear_solver.h
+++ b/internal/ceres/linear_solver.h
@@ -99,6 +99,7 @@
           sparse_linear_algebra_library_type(SUITE_SPARSE),
           use_postordering(false),
           dynamic_sparsity(false),
+          use_explicit_schur_complement(false),
           min_num_iterations(1),
           max_num_iterations(1),
           num_threads(1),
@@ -114,9 +115,10 @@
     DenseLinearAlgebraLibraryType dense_linear_algebra_library_type;
     SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type;
 
-    // See solver.h for information about this flag.
+    // See solver.h for information about these flags.
     bool use_postordering;
     bool dynamic_sparsity;
+    bool use_explicit_schur_complement;
 
     // Number of internal iterations that the solver uses. This
     // parameter only makes sense for iterative solvers like CG.
diff --git a/internal/ceres/schur_complement_solver.cc b/internal/ceres/schur_complement_solver.cc
index d2aa168..33f812b 100644
--- a/internal/ceres/schur_complement_solver.cc
+++ b/internal/ceres/schur_complement_solver.cc
@@ -40,6 +40,7 @@
 #include "ceres/block_random_access_sparse_matrix.h"
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/block_structure.h"
+#include "ceres/conjugate_gradients_solver.h"
 #include "ceres/cxsparse.h"
 #include "ceres/detect_structure.h"
 #include "ceres/internal/eigen.h"
@@ -56,6 +57,61 @@
 
 namespace ceres {
 namespace internal {
+namespace {
+
+class BlockRandomAccessSparseMatrixAdapter : public LinearOperator {
+  public:
+  explicit BlockRandomAccessSparseMatrixAdapter(
+      const BlockRandomAccessSparseMatrix& m)
+      : m_(m) {
+  }
+
+  virtual ~BlockRandomAccessSparseMatrixAdapter() {}
+
+  // y = y + Ax;
+  virtual void RightMultiply(const double* x, double* y) const {
+    m_.SymmetricRightMultiply(x, y);
+  }
+
+  // y = y + A'x;
+  virtual void LeftMultiply(const double* x, double* y) const {
+    m_.SymmetricRightMultiply(x, y);
+  }
+
+  virtual int num_rows() const { return m_.num_rows(); }
+  virtual int num_cols() const { return m_.num_rows(); }
+
+ private:
+  const BlockRandomAccessSparseMatrix& m_;
+};
+
+class BlockRandomAccessDiagonalMatrixAdapter : public LinearOperator {
+  public:
+  explicit BlockRandomAccessDiagonalMatrixAdapter(
+      const BlockRandomAccessDiagonalMatrix& m)
+      : m_(m) {
+  }
+
+  virtual ~BlockRandomAccessDiagonalMatrixAdapter() {}
+
+  // y = y + Ax;
+  virtual void RightMultiply(const double* x, double* y) const {
+    m_.RightMultiply(x, y);
+  }
+
+  // y = y + A'x;
+  virtual void LeftMultiply(const double* x, double* y) const {
+    m_.RightMultiply(x, y);
+  }
+
+  virtual int num_rows() const { return m_.num_rows(); }
+  virtual int num_cols() const { return m_.num_rows(); }
+
+ private:
+  const BlockRandomAccessDiagonalMatrix& m_;
+};
+
+} // namespace
 
 LinearSolver::Summary SchurComplementSolver::SolveImpl(
     BlockSparseMatrix* A,
@@ -82,7 +138,7 @@
 
   double* reduced_solution = x + A->num_cols() - lhs_->num_cols();
   const LinearSolver::Summary summary =
-      SolveReducedLinearSystem(reduced_solution);
+      SolveReducedLinearSystem(per_solve_options, reduced_solution);
   event_logger.AddEvent("ReducedSolve");
 
   if (summary.termination_type == LINEAR_SOLVER_SUCCESS) {
@@ -115,7 +171,9 @@
 // BlockRandomAccessDenseMatrix. The linear system is solved using
 // Eigen's Cholesky factorization.
 LinearSolver::Summary
-DenseSchurComplementSolver::SolveReducedLinearSystem(double* solution) {
+DenseSchurComplementSolver::SolveReducedLinearSystem(
+    const LinearSolver::PerSolveOptions& per_solve_options,
+    double* solution) {
   LinearSolver::Summary summary;
   summary.num_iterations = 0;
   summary.termination_type = LINEAR_SOLVER_SUCCESS;
@@ -249,14 +307,25 @@
 }
 
 LinearSolver::Summary
-SparseSchurComplementSolver::SolveReducedLinearSystem(double* solution) {
+SparseSchurComplementSolver::SolveReducedLinearSystem(
+          const LinearSolver::PerSolveOptions& per_solve_options,
+          double* solution) {
+  if (options().type == ITERATIVE_SCHUR) {
+    CHECK(options().use_explicit_schur_complement);
+    return SolveReducedLinearSystemUsingConjugateGradients(per_solve_options,
+                                                           solution);
+  }
+
   switch (options().sparse_linear_algebra_library_type) {
     case SUITE_SPARSE:
-      return SolveReducedLinearSystemUsingSuiteSparse(solution);
+      return SolveReducedLinearSystemUsingSuiteSparse(per_solve_options,
+                                                      solution);
     case CX_SPARSE:
-      return SolveReducedLinearSystemUsingCXSparse(solution);
+      return SolveReducedLinearSystemUsingCXSparse(per_solve_options,
+                                                   solution);
     case EIGEN_SPARSE:
-      return SolveReducedLinearSystemUsingEigen(solution);
+      return SolveReducedLinearSystemUsingEigen(per_solve_options,
+                                                solution);
     default:
       LOG(FATAL) << "Unknown sparse linear algebra library : "
                  << options().sparse_linear_algebra_library_type;
@@ -270,6 +339,7 @@
 // CHOLMOD's sparse cholesky factorization routines.
 LinearSolver::Summary
 SparseSchurComplementSolver::SolveReducedLinearSystemUsingSuiteSparse(
+    const LinearSolver::PerSolveOptions& per_solve_options,
     double* solution) {
 #ifdef CERES_NO_SUITESPARSE
 
@@ -377,6 +447,7 @@
 // CXSparse's sparse cholesky factorization routines.
 LinearSolver::Summary
 SparseSchurComplementSolver::SolveReducedLinearSystemUsingCXSparse(
+    const LinearSolver::PerSolveOptions& per_solve_options,
     double* solution) {
 #ifdef CERES_NO_CXSPARSE
 
@@ -433,6 +504,7 @@
 // Eigen's sparse cholesky factorization routines.
 LinearSolver::Summary
 SparseSchurComplementSolver::SolveReducedLinearSystemUsingEigen(
+    const LinearSolver::PerSolveOptions& per_solve_options,
     double* solution) {
 #ifndef CERES_USE_EIGEN_SPARSE
 
@@ -514,5 +586,79 @@
 #endif  // CERES_USE_EIGEN_SPARSE
 }
 
+LinearSolver::Summary
+SparseSchurComplementSolver::SolveReducedLinearSystemUsingConjugateGradients(
+    const LinearSolver::PerSolveOptions& per_solve_options,
+    double* solution) {
+  const int num_rows = lhs()->num_rows();
+  // The case where there are no f blocks, and the system is block
+  // diagonal.
+  if (num_rows == 0) {
+    LinearSolver::Summary summary;
+    summary.num_iterations = 0;
+    summary.termination_type = LINEAR_SOLVER_SUCCESS;
+    summary.message = "Success.";
+    return summary;
+  }
+
+  // Only SCHUR_JACOBI is supported over here right now.
+  CHECK_EQ(options().preconditioner_type, SCHUR_JACOBI);
+
+  if (preconditioner_.get() == NULL) {
+    preconditioner_.reset(new BlockRandomAccessDiagonalMatrix(blocks_));
+  }
+
+  BlockRandomAccessSparseMatrix* sc =
+      down_cast<BlockRandomAccessSparseMatrix*>(
+          const_cast<BlockRandomAccessMatrix*>(lhs()));
+
+  // Extract block diagonal from the Schur complement to construct the
+  // schur_jacobi preconditioner.
+  for (int i = 0; i  < blocks_.size(); ++i) {
+    const int block_size = blocks_[i];
+
+    int sc_r, sc_c, sc_row_stride, sc_col_stride;
+    CellInfo* sc_cell_info =
+        CHECK_NOTNULL(sc->GetCell(i, i,
+                                  &sc_r, &sc_c,
+                                  &sc_row_stride, &sc_col_stride));
+    MatrixRef sc_m(sc_cell_info->values, sc_row_stride, sc_col_stride);
+
+    int pre_r, pre_c, pre_row_stride, pre_col_stride;
+    CellInfo* pre_cell_info = CHECK_NOTNULL(
+        preconditioner_->GetCell(i, i,
+                                 &pre_r, &pre_c,
+                                 &pre_row_stride, &pre_col_stride));
+    MatrixRef pre_m(pre_cell_info->values, pre_row_stride, pre_col_stride);
+
+    pre_m.block(pre_r, pre_c, block_size, block_size) =
+        sc_m.block(sc_r, sc_c, block_size, block_size);
+  }
+  preconditioner_->Invert();
+
+  VectorRef(solution, num_rows).setZero();
+
+  scoped_ptr<LinearOperator> lhs_adapter(
+      new BlockRandomAccessSparseMatrixAdapter(*sc));
+  scoped_ptr<LinearOperator> preconditioner_adapter(
+      new BlockRandomAccessDiagonalMatrixAdapter(*preconditioner_));
+
+
+  LinearSolver::Options cg_options;
+  cg_options.min_num_iterations = options().min_num_iterations;
+  cg_options.max_num_iterations = options().max_num_iterations;
+  ConjugateGradientsSolver cg_solver(cg_options);
+
+  LinearSolver::PerSolveOptions cg_per_solve_options;
+  cg_per_solve_options.r_tolerance = per_solve_options.r_tolerance;
+  cg_per_solve_options.q_tolerance = per_solve_options.q_tolerance;
+  cg_per_solve_options.preconditioner = preconditioner_adapter.get();
+
+  return cg_solver.Solve(lhs_adapter.get(),
+                         rhs(),
+                         cg_per_solve_options,
+                         solution);
+}
+
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/schur_complement_solver.h b/internal/ceres/schur_complement_solver.h
index 1b431dc..93d23e3 100644
--- a/internal/ceres/schur_complement_solver.h
+++ b/internal/ceres/schur_complement_solver.h
@@ -46,6 +46,7 @@
 #include "ceres/suitesparse.h"
 #include "ceres/internal/scoped_ptr.h"
 #include "ceres/types.h"
+#include "ceres/block_random_access_diagonal_matrix.h"
 
 #ifdef CERES_USE_EIGEN_SPARSE
 #include "Eigen/SparseCholesky"
@@ -134,6 +135,7 @@
  private:
   virtual void InitStorage(const CompressedRowBlockStructure* bs) = 0;
   virtual LinearSolver::Summary SolveReducedLinearSystem(
+      const LinearSolver::PerSolveOptions& per_solve_options,
       double* solution) = 0;
 
   LinearSolver::Options options_;
@@ -155,6 +157,7 @@
  private:
   virtual void InitStorage(const CompressedRowBlockStructure* bs);
   virtual LinearSolver::Summary SolveReducedLinearSystem(
+      const LinearSolver::PerSolveOptions& per_solve_options,
       double* solution);
 
   CERES_DISALLOW_COPY_AND_ASSIGN(DenseSchurComplementSolver);
@@ -169,12 +172,19 @@
  private:
   virtual void InitStorage(const CompressedRowBlockStructure* bs);
   virtual LinearSolver::Summary SolveReducedLinearSystem(
+      const LinearSolver::PerSolveOptions& per_solve_options,
       double* solution);
   LinearSolver::Summary SolveReducedLinearSystemUsingSuiteSparse(
+      const LinearSolver::PerSolveOptions& per_solve_options,
       double* solution);
   LinearSolver::Summary SolveReducedLinearSystemUsingCXSparse(
+      const LinearSolver::PerSolveOptions& per_solve_options,
       double* solution);
   LinearSolver::Summary SolveReducedLinearSystemUsingEigen(
+      const LinearSolver::PerSolveOptions& per_solve_options,
+      double* solution);
+  LinearSolver::Summary SolveReducedLinearSystemUsingConjugateGradients(
+      const LinearSolver::PerSolveOptions& per_solve_options,
       double* solution);
 
   // Size of the blocks in the Schur complement.
@@ -206,6 +216,7 @@
   scoped_ptr<SimplicialLDLT> simplicial_ldlt_;
 #endif
 
+  scoped_ptr<BlockRandomAccessDiagonalMatrix> preconditioner_;
   CERES_DISALLOW_COPY_AND_ASSIGN(SparseSchurComplementSolver);
 };
 
diff --git a/internal/ceres/solver.cc b/internal/ceres/solver.cc
index f90045b..e1c5ee3 100644
--- a/internal/ceres/solver.cc
+++ b/internal/ceres/solver.cc
@@ -120,6 +120,14 @@
     OPTION_GT(max_consecutive_nonmonotonic_steps, 0);
   }
 
+  if (options.linear_solver_type == ITERATIVE_SCHUR &&
+      options.use_explicit_schur_complement &&
+      options.preconditioner_type != SCHUR_JACOBI) {
+    *error =  "use_explicit_schur_complement only supports"
+        "SCHUR_JACOBI as the preconditioner.";
+    return false;
+  }
+
   if (options.preconditioner_type == CLUSTER_JACOBI &&
       options.sparse_linear_algebra_library_type != SUITE_SPARSE) {
     *error =  "CLUSTER_JACOBI requires "
diff --git a/internal/ceres/trust_region_preprocessor.cc b/internal/ceres/trust_region_preprocessor.cc
index ba3baa1..22ea1ec 100644
--- a/internal/ceres/trust_region_preprocessor.cc
+++ b/internal/ceres/trust_region_preprocessor.cc
@@ -184,6 +184,8 @@
       options.sparse_linear_algebra_library_type;
   pp->linear_solver_options.dense_linear_algebra_library_type =
       options.dense_linear_algebra_library_type;
+  pp->linear_solver_options.use_explicit_schur_complement =
+      options.use_explicit_schur_complement;
   pp->linear_solver_options.dynamic_sparsity = options.dynamic_sparsity;
   pp->linear_solver_options.num_threads = options.num_linear_solver_threads;