Expose SubsetPreconditioner in the API

https://github.com/ceres-solver/ceres-solver/issues/270

Detailed list of changes:

1. Add SUBSET to the PreconditionerType enum.
2. Add Solver::Options::residual_blocks_for_subset_preconditioner
3. Integrate SubsetPreconditioner into the CGNR solver.
4. Add the reordering logic needed for this to TrustRegionPreprocessor.
5. Expect CreateJacobianBlockTranspose to take the starting row block
   so that we can work with subparts of the Jacobian matrix.
6. Extend the denoising example to use this preconditioner.

As an illustration of its performance, we consider the performance of
denoising -input ../data/ceres_noisy.pgm  --foe_file ../data/5x5.foe

tl;dr

For the same cost,

SPARSE_NORMAL_CHOLESKY -  81s
CGNR + JACOBI          - 718s
CGNR + SUBSET          -  57s

SPARSE_NORMAL_CHOLESKY
======================

Cost:
Initial                          2.317806e+05
Final                            2.232323e+04
Change                           2.094574e+05

Minimizer iterations                       10
Successful steps                           10
Unsuccessful steps                          0

Time (in seconds):
Preprocessor                         2.999746

  Residual only evaluation           2.306811 (10)
  Jacobian & residual evaluation     7.421727 (10)
  Linear solver                     65.517273 (10)
Minimizer                           78.731011

Postprocessor                        0.026079
Total                               81.756836

Termination:                      CONVERGENCE (Function tolerance reached. |cost_change|/cost: 8.573046e-04 <= 1.000000e-03)

CGNR + JACOBI
=============
Cost:
Initial                          2.317806e+05
Final                            2.232344e+04
Change                           2.094572e+05

Minimizer iterations                       10
Successful steps                           10
Unsuccessful steps                          0

Time (in seconds):
Preprocessor                         0.648814

  Residual only evaluation           2.297607 (10)
  Jacobian & residual evaluation     7.327886 (10)
  Linear solver                    699.601248 (10)
Minimizer                          712.419493

Postprocessor                        0.024014
Total                              713.092321

Termination:                      CONVERGENCE (Function tolerance reached. |cost_change|/cost: 8.528538e-04 <= 1.000000e-03)

CGNR + SUBSET (random 20% residuals used for the preconditioner)
===============================================================
Cost:
Initial                          2.317806e+05
Final                            2.232327e+04
Change                           2.094574e+05

Minimizer iterations                       10
Successful steps                           10
Unsuccessful steps                          0

Time (in seconds):
Preprocessor                         1.472743

  Residual only evaluation           2.428315 (10)
  Jacobian & residual evaluation     7.367796 (10)
  Linear solver                     42.585999 (10)
Minimizer                           55.664459

Postprocessor                        0.024098
Total                               57.161301

Termination:                      CONVERGENCE (Function tolerance reached. |cost_change|/cost: 8.538277e-04 <= 1.000000e-03)

Change-Id: Ifb011408bd53edbb9439b0b7345649a38f999e18
diff --git a/docs/source/nnls_solving.rst b/docs/source/nnls_solving.rst
index 3a2ac01..d03e62e 100644
--- a/docs/source/nnls_solving.rst
+++ b/docs/source/nnls_solving.rst
@@ -750,6 +750,9 @@
 stages and is not as mature as the other preconditioners described
 above.
 
+TODO(sameeragarwal): Rewrite this section and add the description of
+the SubsetPreconditioner.
+
 .. _section-ordering:
 
 Ordering
diff --git a/examples/denoising.cc b/examples/denoising.cc
index e64d70f..9b07735 100644
--- a/examples/denoising.cc
+++ b/examples/denoising.cc
@@ -41,24 +41,66 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <vector>
+#include <random>
 #include <sstream>
 #include <string>
+#include <vector>
 
 #include "ceres/ceres.h"
+#include "fields_of_experts.h"
 #include "gflags/gflags.h"
 #include "glog/logging.h"
-
-#include "fields_of_experts.h"
 #include "pgm_image.h"
 
 DEFINE_string(input, "", "File to which the output image should be written");
 DEFINE_string(foe_file, "", "FoE file to use");
 DEFINE_string(output, "", "File to which the output image should be written");
 DEFINE_double(sigma, 20.0, "Standard deviation of noise");
-DEFINE_bool(verbose, false, "Prints information about the solver progress.");
-DEFINE_bool(line_search, false, "Use a line search instead of trust region "
+DEFINE_string(trust_region_strategy,
+              "levenberg_marquardt",
+              "Options are: levenberg_marquardt, dogleg.");
+DEFINE_string(dogleg,
+              "traditional_dogleg",
+              "Options are: traditional_dogleg,"
+              "subspace_dogleg.");
+DEFINE_string(linear_solver,
+              "sparse_normal_cholesky",
+              "Options are: "
+              "sparse_normal_cholesky and cgnr.");
+DEFINE_string(preconditioner,
+              "jacobi",
+              "Options are: "
+              "identity, jacobi, subset");
+DEFINE_string(sparse_linear_algebra_library,
+              "suite_sparse",
+              "Options are: suite_sparse, cx_sparse and eigen_sparse");
+DEFINE_double(eta,
+              1e-2,
+              "Default value for eta. Eta determines the "
+              "accuracy of each linear solve of the truncated newton step. "
+              "Changing this parameter can affect solve performance.");
+DEFINE_int32(num_threads, 1, "Number of threads.");
+DEFINE_int32(num_iterations, 10, "Number of iterations.");
+DEFINE_bool(nonmonotonic_steps,
+            false,
+            "Trust region algorithm can use"
+            " nonmonotic steps.");
+DEFINE_bool(inner_iterations,
+            false,
+            "Use inner iterations to non-linearly "
+            "refine each successful trust region step.");
+DEFINE_bool(mixed_precision_solves, false, "Use mixed precision solves.");
+DEFINE_int32(max_num_refinement_iterations,
+             0,
+             "Iterative refinement iterations");
+DEFINE_bool(line_search,
+            false,
+            "Use a line search instead of trust region "
             "algorithm.");
+DEFINE_double(subset_fraction,
+              0.2,
+              "The fraction of residual blocks to use for the"
+              " subset preconditioner.");
 
 namespace ceres {
 namespace examples {
@@ -70,8 +112,7 @@
 //
 class QuadraticCostFunction : public ceres::SizedCostFunction<1, 1> {
  public:
-  QuadraticCostFunction(double a, double b)
-    : sqrta_(std::sqrt(a)), b_(b) {}
+  QuadraticCostFunction(double a, double b) : sqrta_(std::sqrt(a)), b_(b) {}
   virtual bool Evaluate(double const* const* parameters,
                         double* residuals,
                         double** jacobians) const {
@@ -82,6 +123,7 @@
     }
     return true;
   }
+
  private:
   double sqrta_, b_;
 };
@@ -94,13 +136,11 @@
   // Create the data term
   CHECK_GT(FLAGS_sigma, 0.0);
   const double coefficient = 1 / (2.0 * FLAGS_sigma * FLAGS_sigma);
-  for (unsigned index = 0; index < image.NumPixels(); ++index) {
-    ceres::CostFunction* cost_function =
-        new QuadraticCostFunction(coefficient,
-                                  image.PixelFromLinearIndex(index));
-    problem->AddResidualBlock(cost_function,
-                              NULL,
-                              solution->MutablePixelFromLinearIndex(index));
+  for (int index = 0; index < image.NumPixels(); ++index) {
+    ceres::CostFunction* cost_function = new QuadraticCostFunction(
+        coefficient, image.PixelFromLinearIndex(index));
+    problem->AddResidualBlock(
+        cost_function, NULL, solution->MutablePixelFromLinearIndex(index));
   }
 
   // Create Ceres cost and loss functions for regularization. One is needed for
@@ -127,14 +167,41 @@
       // For this patch with coordinates (x, y), we will add foe.NumFilters()
       // terms to the objective function.
       for (int alpha_index = 0; alpha_index < foe.NumFilters(); ++alpha_index) {
-        problem->AddResidualBlock(cost_function[alpha_index],
-                                  loss_function[alpha_index],
-                                  pixels);
+        problem->AddResidualBlock(
+            cost_function[alpha_index], loss_function[alpha_index], pixels);
       }
     }
   }
 }
 
+void SetLinearSolver(Solver::Options* options) {
+  CHECK(StringToLinearSolverType(FLAGS_linear_solver,
+                                 &options->linear_solver_type));
+  CHECK(StringToPreconditionerType(FLAGS_preconditioner,
+                                   &options->preconditioner_type));
+  CHECK(StringToSparseLinearAlgebraLibraryType(
+      FLAGS_sparse_linear_algebra_library,
+      &options->sparse_linear_algebra_library_type));
+  options->use_mixed_precision_solves = FLAGS_mixed_precision_solves;
+  options->max_num_refinement_iterations = FLAGS_max_num_refinement_iterations;
+}
+
+void SetMinimizerOptions(Solver::Options* options) {
+  options->max_num_iterations = FLAGS_num_iterations;
+  options->minimizer_progress_to_stdout = true;
+  options->num_threads = FLAGS_num_threads;
+  options->eta = FLAGS_eta;
+  options->use_nonmonotonic_steps = FLAGS_nonmonotonic_steps;
+  if (FLAGS_line_search) {
+    options->minimizer_type = ceres::LINE_SEARCH;
+  }
+
+  CHECK(StringToTrustRegionStrategyType(FLAGS_trust_region_strategy,
+                                        &options->trust_region_strategy_type));
+  CHECK(StringToDoglegType(FLAGS_dogleg, &options->dogleg_type));
+  options->use_inner_iterations = FLAGS_inner_iterations;
+}
+
 // Solves the FoE problem using Ceres and post-processes it to make sure the
 // solution stays within [0, 255].
 void SolveProblem(Problem* problem, PGMImage<double>* solution) {
@@ -142,23 +209,33 @@
   // to be faster for 2x2 filters, but gives solutions with slightly higher
   // objective function value.
   ceres::Solver::Options options;
-  options.max_num_iterations = 100;
-  if (FLAGS_verbose) {
-    options.minimizer_progress_to_stdout = true;
-  }
-
-  if (FLAGS_line_search) {
-    options.minimizer_type = ceres::LINE_SEARCH;
-  }
-
-  options.linear_solver_type = ceres::SPARSE_NORMAL_CHOLESKY;
+  SetMinimizerOptions(&options);
+  SetLinearSolver(&options);
   options.function_tolerance = 1e-3;  // Enough for denoising.
 
+  if (options.linear_solver_type == ceres::CGNR &&
+      options.preconditioner_type == ceres::SUBSET) {
+    std::vector<ResidualBlockId> residual_blocks;
+    problem->GetResidualBlocks(&residual_blocks);
+
+    // To use the SUBSET preconditioner we need to provide a list of
+    // residual blocks (rows of the Jacobian). The denoising problem
+    // has fairly general sparsity, and there is no apriori reason to
+    // select one residual block over another, so we will randomly
+    // subsample the residual blocks with probability subset_fraction.
+    std::default_random_engine engine;
+    std::uniform_real_distribution<> distribution(0, 1);  // rage 0 - 1
+    for (auto residual_block : residual_blocks) {
+      if (distribution(engine) <= FLAGS_subset_fraction) {
+        options.residual_blocks_for_subset_preconditioner.insert(
+            residual_block);
+      }
+    }
+  }
+
   ceres::Solver::Summary summary;
   ceres::Solve(options, problem, &summary);
-  if (FLAGS_verbose) {
-    std::cout << summary.FullReport() << "\n";
-  }
+  std::cout << summary.FullReport() << "\n";
 
   // Make the solution stay in [0, 255].
   for (int x = 0; x < solution->width(); ++x) {
@@ -175,8 +252,8 @@
 
 int main(int argc, char** argv) {
   using namespace ceres::examples;
-  std::string
-      usage("This program denoises an image using Ceres.  Sample usage:\n");
+  std::string usage(
+      "This program denoises an image using Ceres.  Sample usage:\n");
   usage += argv[0];
   usage += " --input=<noisy image PGM file> --foe_file=<FoE file name>";
   CERES_GFLAGS_NAMESPACE::SetUsageMessage(usage);
diff --git a/include/ceres/solver.h b/include/ceres/solver.h
index 1e479ae..ab7e9c2 100644
--- a/include/ceres/solver.h
+++ b/include/ceres/solver.h
@@ -34,8 +34,10 @@
 #include <cmath>
 #include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "ceres/crs_matrix.h"
+#include "ceres/problem.h"
 #include "ceres/internal/disable_warnings.h"
 #include "ceres/internal/port.h"
 #include "ceres/iteration_callback.h"
@@ -44,8 +46,6 @@
 
 namespace ceres {
 
-class Problem;
-
 // Interface for non-linear least squares solvers.
 class CERES_EXPORT Solver {
  public:
@@ -337,6 +337,30 @@
     // preconditioner_type is CLUSTER_JACOBI or CLUSTER_TRIDIAGONAL.
     VisibilityClusteringType visibility_clustering_type = CANONICAL_VIEWS;
 
+    // Subset preconditioner is a general purpose preconditioner for
+    // linear least squares problems. Given a set of residual blocks,
+    // it uses the corresponding subset of the rows of the Jacobian to
+    // construct a preconditioner.
+    //
+    // Suppose the Jacobian J has been horizontally partitioned as
+    //
+    // J = [P]
+    //     [Q]
+    //
+    // Where, Q is the set of rows corresponding to the residual
+    // blocks in residual_blocks_for_subset_preconditioner.
+    //
+    // The preconditioner is the inverse of the matrix Q'Q.
+    //
+    // Obviously, the efficacy of the preconditioner depends on how
+    // well the matrix Q approximates J'J, or how well the chosen
+    // residual blocks approximate the non-linear least squares
+    // problem.
+    //
+    // If Solver::Options::preconditioner_type == SUBSET, then
+    // residual_blocks_for_subset_preconditioner must be non-empty.
+    std::unordered_set<ResidualBlockId> residual_blocks_for_subset_preconditioner;
+
     // Ceres supports using multiple dense linear algebra libraries
     // for dense matrix factorizations. Currently EIGEN and LAPACK are
     // the valid choices. EIGEN is always available, LAPACK refers to
diff --git a/include/ceres/types.h b/include/ceres/types.h
index fec3cea..ed448b2 100644
--- a/include/ceres/types.h
+++ b/include/ceres/types.h
@@ -112,10 +112,29 @@
   // the scene to determine the sparsity structure of the
   // preconditioner. This is done using a clustering algorithm. The
   // available visibility clustering algorithms are described below.
-  //
-  // Note: Requires SuiteSparse.
   CLUSTER_JACOBI,
-  CLUSTER_TRIDIAGONAL
+  CLUSTER_TRIDIAGONAL,
+
+  // Subset preconditioner is a general purpose preconditioner
+  // linear least squares problems. Given a set of residual blocks,
+  // it uses the corresponding subset of the rows of the Jacobian to
+  // construct a preconditioner.
+  //
+  // Suppose the Jacobian J has been horizontally partitioned as
+  //
+  // J = [P]
+  //     [Q]
+  //
+  // Where, Q is the set of rows corresponding to the residual
+  // blocks in residual_blocks_for_subset_preconditioner.
+  //
+  // The preconditioner is the inverse of the matrix Q'Q.
+  //
+  // Obviously, the efficacy of the preconditioner depends on how
+  // well the matrix Q approximates J'J, or how well the chosen
+  // residual blocks approximate the non-linear least squares
+  // problem.
+  SUBSET,
 };
 
 enum VisibilityClusteringType {
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index 463fbbd..9dba1cf 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -35,6 +35,7 @@
 #include "ceres/conjugate_gradients_solver.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/linear_solver.h"
+#include "ceres/subset_preconditioner.h"
 #include "ceres/wall_time.h"
 #include "glog/logging.h"
 
@@ -42,10 +43,14 @@
 namespace internal {
 
 CgnrSolver::CgnrSolver(const LinearSolver::Options& options)
-  : options_(options) {
+    : options_(options) {
   if (options_.preconditioner_type != JACOBI &&
-      options_.preconditioner_type != IDENTITY) {
-    LOG(FATAL) << "CGNR only supports IDENTITY and JACOBI preconditioners.";
+      options_.preconditioner_type != IDENTITY &&
+      options_.preconditioner_type != SUBSET) {
+    LOG(FATAL)
+        << "Preconditioner = "
+        << PreconditionerTypeToString(options_.preconditioner_type) << ". "
+        << "Congratulations, you found a bug in Ceres. Please report it.";
   }
 }
 
@@ -63,16 +68,31 @@
   z.setZero();
   A->LeftMultiply(b, z.data());
 
-  // Precondition if necessary.
-  LinearSolver::PerSolveOptions cg_per_solve_options = per_solve_options;
-  if (options_.preconditioner_type == JACOBI) {
-    if (preconditioner_.get() == NULL) {
+  if (!preconditioner_) {
+    if (options_.preconditioner_type == JACOBI) {
       preconditioner_.reset(new BlockJacobiPreconditioner(*A));
+    } else if (options_.preconditioner_type == SUBSET) {
+      Preconditioner::Options preconditioner_options;
+      preconditioner_options.type = SUBSET;
+      preconditioner_options.subset_preconditioner_start_row_block =
+          options_.subset_preconditioner_start_row_block;
+      preconditioner_options.sparse_linear_algebra_library_type =
+          options_.sparse_linear_algebra_library_type;
+      preconditioner_options.use_postordering = options_.use_postordering;
+      preconditioner_options.num_threads = options_.num_threads;
+      preconditioner_options.context = options_.context;
+      preconditioner_.reset(
+          new SubsetPreconditioner(preconditioner_options, *A));
     }
-    preconditioner_->Update(*A, per_solve_options.D);
-    cg_per_solve_options.preconditioner = preconditioner_.get();
   }
 
+  if (preconditioner_) {
+    preconditioner_->Update(*A, per_solve_options.D);
+  }
+
+  LinearSolver::PerSolveOptions cg_per_solve_options = per_solve_options;
+  cg_per_solve_options.preconditioner = preconditioner_.get();
+
   // Solve (AtA + DtD)x = z (= Atb).
   VectorRef(x, A->num_cols()).setZero();
   CgnrLinearOperator lhs(*A, per_solve_options.D);
diff --git a/internal/ceres/linear_solver.h b/internal/ceres/linear_solver.h
index 24c245d..cb624b3 100644
--- a/internal/ceres/linear_solver.h
+++ b/internal/ceres/linear_solver.h
@@ -162,6 +162,7 @@
 
     bool use_mixed_precision_solves = false;
     int max_num_refinement_iterations = 0;
+    int subset_preconditioner_start_row_block = -1;
     ContextImpl* context = nullptr;
   };
 
diff --git a/internal/ceres/program.cc b/internal/ceres/program.cc
index fa3fbf1..3a8f0e3 100644
--- a/internal/ceres/program.cc
+++ b/internal/ceres/program.cc
@@ -397,18 +397,20 @@
   return true;
 }
 
-TripletSparseMatrix* Program::CreateJacobianBlockSparsityTranspose() const {
+std::unique_ptr<TripletSparseMatrix>
+Program::CreateJacobianBlockSparsityTranspose(int start_residual_block) const {
   // Matrix to store the block sparsity structure of the Jacobian.
-  TripletSparseMatrix* tsm =
-      new TripletSparseMatrix(NumParameterBlocks(),
-                              NumResidualBlocks(),
-                              10 * NumResidualBlocks());
+  const int num_rows = NumParameterBlocks();
+  const int num_cols = NumResidualBlocks() - start_residual_block;
+
+  std::unique_ptr<TripletSparseMatrix> tsm(
+      new TripletSparseMatrix(num_rows, num_cols, 10 * num_cols));
   int num_nonzeros = 0;
   int* rows = tsm->mutable_rows();
   int* cols = tsm->mutable_cols();
   double* values = tsm->mutable_values();
 
-  for (int c = 0; c < residual_blocks_.size(); ++c) {
+  for (int c = start_residual_block; c < residual_blocks_.size(); ++c) {
     const ResidualBlock* residual_block = residual_blocks_[c];
     const int num_parameter_blocks = residual_block->NumParameterBlocks();
     ParameterBlock* const* parameter_blocks =
@@ -430,7 +432,7 @@
 
       const int r = parameter_blocks[j]->index();
       rows[num_nonzeros] = r;
-      cols[num_nonzeros] = c;
+      cols[num_nonzeros] = c - start_residual_block;
       values[num_nonzeros] = 1.0;
       ++num_nonzeros;
     }
diff --git a/internal/ceres/program.h b/internal/ceres/program.h
index 297be65..dc544c2 100644
--- a/internal/ceres/program.h
+++ b/internal/ceres/program.h
@@ -131,8 +131,10 @@
   // structure corresponding to the block sparsity of the transpose of
   // the Jacobian matrix.
   //
-  // Caller owns the result.
-  TripletSparseMatrix* CreateJacobianBlockSparsityTranspose() const;
+  // start_residual_block which allows the user to ignore the first
+  // start_residual_block residuals.
+  std::unique_ptr<TripletSparseMatrix> CreateJacobianBlockSparsityTranspose(
+      int start_residual_block = 0) const;
 
   // Create a copy of this program and removes constant parameter
   // blocks and residual blocks with no varying parameter blocks while
diff --git a/internal/ceres/program_test.cc b/internal/ceres/program_test.cc
index 6cb8e9e..99379c0 100644
--- a/internal/ceres/program_test.cc
+++ b/internal/ceres/program_test.cc
@@ -238,7 +238,9 @@
   EXPECT_DOUBLE_EQ(fixed_cost, expected_fixed_cost);
 }
 
-TEST(Program, CreateJacobianBlockSparsityTranspose) {
+class BlockJacobianTest : public ::testing::TestWithParam<int> {};
+
+TEST_P(BlockJacobianTest, CreateJacobianBlockSparsityTranspose) {
   ProblemImpl problem;
   double x[2];
   double y[3];
@@ -304,17 +306,26 @@
   Program* program = problem.mutable_program();
   program->SetParameterOffsetsAndIndex();
 
+  const int start_row_block = GetParam();
   std::unique_ptr<TripletSparseMatrix> actual_block_sparse_jacobian(
-      program->CreateJacobianBlockSparsityTranspose());
+      program->CreateJacobianBlockSparsityTranspose(start_row_block));
 
-  Matrix expected_dense_jacobian;
-  expected_block_sparse_jacobian.ToDenseMatrix(&expected_dense_jacobian);
+  Matrix expected_full_dense_jacobian;
+  expected_block_sparse_jacobian.ToDenseMatrix(&expected_full_dense_jacobian);
+  Matrix expected_dense_jacobian =
+      expected_full_dense_jacobian.rightCols(8 - start_row_block);
 
   Matrix actual_dense_jacobian;
   actual_block_sparse_jacobian->ToDenseMatrix(&actual_dense_jacobian);
+  EXPECT_EQ(expected_dense_jacobian.rows(), actual_dense_jacobian.rows());
+  EXPECT_EQ(expected_dense_jacobian.cols(), actual_dense_jacobian.cols());
   EXPECT_EQ((expected_dense_jacobian - actual_dense_jacobian).norm(), 0.0);
 }
 
+INSTANTIATE_TEST_SUITE_P(AllColumns,
+                         BlockJacobianTest,
+                         ::testing::Range(0, 7));
+
 template <int kNumResiduals, int kNumParameterBlocks>
 class NumParameterBlocksCostFunction : public CostFunction {
  public:
diff --git a/internal/ceres/reorder_program.cc b/internal/ceres/reorder_program.cc
index 3ad2a5f..45dbbc4 100644
--- a/internal/ceres/reorder_program.cc
+++ b/internal/ceres/reorder_program.cc
@@ -527,18 +527,14 @@
 
   // Schur type solvers also require that their residual blocks be
   // lexicographically ordered.
-  if (!LexicographicallyOrderResidualBlocks(size_of_first_elimination_group,
-                                            program,
-                                            error)) {
-    return false;
-  }
-
-  return true;
+  return LexicographicallyOrderResidualBlocks(
+      size_of_first_elimination_group, program, error);
 }
 
-bool ReorderProgramForSparseNormalCholesky(
+bool ReorderProgramForSparseCholesky(
     const SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type,
     const ParameterBlockOrdering& parameter_block_ordering,
+    int start_row_block,
     Program* program,
     string* error) {
   if (parameter_block_ordering.NumElements() != program->NumParameterBlocks()) {
@@ -552,7 +548,7 @@
 
   // Compute a block sparse presentation of J'.
   std::unique_ptr<TripletSparseMatrix> tsm_block_jacobian_transpose(
-      program->CreateJacobianBlockSparsityTranspose());
+      program->CreateJacobianBlockSparsityTranspose(start_row_block));
 
   vector<int> ordering(program->NumParameterBlocks(), 0);
   vector<ParameterBlock*>& parameter_blocks =
@@ -602,5 +598,17 @@
   return true;
 }
 
+int ReorderResidualBlocksByPartition(
+    const std::unordered_set<ResidualBlockId>& bottom_residual_blocks,
+    Program* program) {
+  auto residual_blocks = program->mutable_residual_blocks();
+  auto it = std::partition(
+      residual_blocks->begin(), residual_blocks->end(),
+      [&bottom_residual_blocks](ResidualBlock* r) {
+        return bottom_residual_blocks.count(r) == 0;
+      });
+  return it - residual_blocks->begin();
+}
+
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/reorder_program.h b/internal/ceres/reorder_program.h
index 36e5d16..88cbee3 100644
--- a/internal/ceres/reorder_program.h
+++ b/internal/ceres/reorder_program.h
@@ -89,12 +89,27 @@
 // fill-reducing ordering is available in the sparse linear algebra
 // library (SuiteSparse version >= 4.2.0) then the fill reducing
 // ordering will take it into account, otherwise it will be ignored.
-bool ReorderProgramForSparseNormalCholesky(
+bool ReorderProgramForSparseCholesky(
     SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type,
     const ParameterBlockOrdering& parameter_block_ordering,
+    int start_row_block,
     Program* program,
     std::string* error);
 
+// Reorder the residual blocks in the program so that all the residual
+// blocks in bottom_residual_blocks are at the bottom. The return
+// value is the number of residual blocks in the program in "top" part
+// of the Program, i.e., the ones not included in
+// bottom_residual_blocks.
+//
+// This number can be different from program->NumResidualBlocks() -
+// bottom_residual_blocks.size() because we allow
+// bottom_residual_blocks to contain residual blocks not present in
+// the Program.
+int ReorderResidualBlocksByPartition(
+    const std::unordered_set<ResidualBlockId>& bottom_residual_blocks,
+    Program* program);
+
 }  // namespace internal
 }  // namespace ceres
 
diff --git a/internal/ceres/reorder_program_test.cc b/internal/ceres/reorder_program_test.cc
index cf3e9f6..a3e2c78 100644
--- a/internal/ceres/reorder_program_test.cc
+++ b/internal/ceres/reorder_program_test.cc
@@ -30,12 +30,12 @@
 
 #include "ceres/reorder_program.h"
 
+#include <random>
 #include "ceres/parameter_block.h"
 #include "ceres/problem_impl.h"
 #include "ceres/program.h"
 #include "ceres/sized_cost_function.h"
 #include "ceres/solver.h"
-
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
@@ -169,7 +169,7 @@
 }
 
 #ifndef CERES_NO_SUITESPARSE
-class ReorderProgramForSparseNormalCholeskyUsingSuiteSparseTest :
+class ReorderProgramFoSparseCholeskyUsingSuiteSparseTest :
       public ::testing::Test {
  protected:
   void SetUp() {
@@ -188,9 +188,10 @@
         program->parameter_blocks();
 
     std::string error;
-    EXPECT_TRUE(ReorderProgramForSparseNormalCholesky(
+    EXPECT_TRUE(ReorderProgramForSparseCholesky(
                     ceres::SUITE_SPARSE,
                     linear_solver_ordering,
+                    0, /* use all rows */
                     program,
                     &error));
     const vector<ParameterBlock*>& ordered_parameter_blocks =
@@ -208,7 +209,7 @@
   double z_;
 };
 
-TEST_F(ReorderProgramForSparseNormalCholeskyUsingSuiteSparseTest,
+TEST_F(ReorderProgramFoSparseCholeskyUsingSuiteSparseTest,
        EverythingInGroupZero) {
   ParameterBlockOrdering linear_solver_ordering;
   linear_solver_ordering.AddElementToGroup(&x_, 0);
@@ -218,7 +219,7 @@
   ComputeAndValidateOrdering(linear_solver_ordering);
 }
 
-TEST_F(ReorderProgramForSparseNormalCholeskyUsingSuiteSparseTest,
+TEST_F(ReorderProgramFoSparseCholeskyUsingSuiteSparseTest,
        ContiguousGroups) {
   ParameterBlockOrdering linear_solver_ordering;
   linear_solver_ordering.AddElementToGroup(&x_, 0);
@@ -228,7 +229,7 @@
   ComputeAndValidateOrdering(linear_solver_ordering);
 }
 
-TEST_F(ReorderProgramForSparseNormalCholeskyUsingSuiteSparseTest,
+TEST_F(ReorderProgramFoSparseCholeskyUsingSuiteSparseTest,
        GroupsWithGaps) {
   ParameterBlockOrdering linear_solver_ordering;
   linear_solver_ordering.AddElementToGroup(&x_, 0);
@@ -238,7 +239,7 @@
   ComputeAndValidateOrdering(linear_solver_ordering);
 }
 
-TEST_F(ReorderProgramForSparseNormalCholeskyUsingSuiteSparseTest,
+TEST_F(ReorderProgramFoSparseCholeskyUsingSuiteSparseTest,
        NonContiguousStartingAtTwo) {
   ParameterBlockOrdering linear_solver_ordering;
   linear_solver_ordering.AddElementToGroup(&x_, 2);
@@ -249,5 +250,45 @@
 }
 #endif  // CERES_NO_SUITESPARSE
 
+TEST(_, ReorderResidualBlocksbyPartition) {
+  ProblemImpl problem;
+  double x;
+  double y;
+  double z;
+
+  problem.AddParameterBlock(&x, 1);
+  problem.AddParameterBlock(&y, 1);
+  problem.AddParameterBlock(&z, 1);
+
+  problem.AddResidualBlock(new UnaryCostFunction(), NULL, &x);
+  problem.AddResidualBlock(new BinaryCostFunction(), NULL, &z, &x);
+  problem.AddResidualBlock(new BinaryCostFunction(), NULL, &z, &y);
+  problem.AddResidualBlock(new UnaryCostFunction(), NULL, &z);
+  problem.AddResidualBlock(new BinaryCostFunction(), NULL, &x, &y);
+  problem.AddResidualBlock(new UnaryCostFunction(), NULL, &y);
+
+  std::vector<ResidualBlockId> residual_block_ids;
+  problem.GetResidualBlocks(&residual_block_ids);
+  std::vector<ResidualBlock*> residual_blocks =
+      problem.program().residual_blocks();
+  auto rng = std::default_random_engine{};
+  for (int i = 1; i < 6; ++i) {
+    std::shuffle(
+        std::begin(residual_block_ids), std::end(residual_block_ids), rng);
+    std::unordered_set<ResidualBlockId> bottom(residual_block_ids.begin(),
+                                               residual_block_ids.begin() + i);
+    const int start_bottom =
+        ReorderResidualBlocksByPartition(bottom, problem.mutable_program());
+    std::vector<ResidualBlock*> actual_residual_blocks =
+        problem.program().residual_blocks();
+    EXPECT_THAT(actual_residual_blocks,
+                testing::UnorderedElementsAreArray(residual_blocks));
+    EXPECT_EQ(start_bottom, residual_blocks.size() - i);
+    for (int j = start_bottom; j < residual_blocks.size(); ++j) {
+      EXPECT_THAT(bottom, ::testing::Contains(actual_residual_blocks[j]));
+    }
+  }
+}
+
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/solver.cc b/internal/ceres/solver.cc
index 84a5e5a..b22a5e1 100644
--- a/internal/ceres/solver.cc
+++ b/internal/ceres/solver.cc
@@ -153,47 +153,38 @@
     return false;
   }
 
-  if (options.sparse_linear_algebra_library_type == NO_SPARSE) {
-    const char* error_template =
-        "Can't use %s with "
-        "Solver::Options::sparse_linear_algebra_library_type = NO_SPARSE.";
-    const char* name = nullptr;
-
-    if (options.linear_solver_type == SPARSE_NORMAL_CHOLESKY ||
-        options.linear_solver_type == SPARSE_SCHUR) {
-      name = LinearSolverTypeToString(options.linear_solver_type);
-    } else if (options.linear_solver_type == ITERATIVE_SCHUR &&
-               (options.preconditioner_type == CLUSTER_JACOBI ||
-                options.preconditioner_type == CLUSTER_TRIDIAGONAL)) {
-      name = PreconditionerTypeToString(options.preconditioner_type);
-    }
-
-    if (name != nullptr) {
-      *error = StringPrintf(error_template, name);
-      return false;
-    }
-  } else if (!IsSparseLinearAlgebraLibraryTypeAvailable(
-                 options.sparse_linear_algebra_library_type)) {
-    const char* error_template =
-        "Can't use %s with "
-        "Solver::Options::sparse_linear_algebra_library_type = %s, "
-        "because support was not enabled when Ceres Solver was built.";
+  {
+    const char* sparse_linear_algebra_library_name =
+        SparseLinearAlgebraLibraryTypeToString(
+            options.sparse_linear_algebra_library_type);
     const char* name = nullptr;
     if (options.linear_solver_type == SPARSE_NORMAL_CHOLESKY ||
         options.linear_solver_type == SPARSE_SCHUR) {
       name = LinearSolverTypeToString(options.linear_solver_type);
-    } else if (options.linear_solver_type == ITERATIVE_SCHUR &&
-               (options.preconditioner_type == CLUSTER_JACOBI ||
-                options.preconditioner_type == CLUSTER_TRIDIAGONAL)) {
+    } else if ((options.linear_solver_type == ITERATIVE_SCHUR &&
+                (options.preconditioner_type == CLUSTER_JACOBI ||
+                 options.preconditioner_type == CLUSTER_TRIDIAGONAL)) ||
+               (options.linear_solver_type == CGNR &&
+                options.preconditioner_type == SUBSET)) {
       name = PreconditionerTypeToString(options.preconditioner_type);
     }
 
-    if (name != nullptr) {
-      *error = StringPrintf(error_template,
-                            name,
-                            SparseLinearAlgebraLibraryTypeToString(
-                                options.sparse_linear_algebra_library_type));
-      return false;
+    if (name) {
+      if (options.sparse_linear_algebra_library_type == NO_SPARSE) {
+        *error = StringPrintf(
+            "Can't use %s with "
+            "Solver::Options::sparse_linear_algebra_library_type = %s.",
+            name, sparse_linear_algebra_library_name);
+        return false;
+      } else if (!IsSparseLinearAlgebraLibraryTypeAvailable(
+                     options.sparse_linear_algebra_library_type)) {
+        *error = StringPrintf(
+            "Can't use %s with "
+            "Solver::Options::sparse_linear_algebra_library_type = %s, "
+            "because support was not enabled when Ceres Solver was built.",
+            name, sparse_linear_algebra_library_name);
+        return false;
+      }
     }
   }
 
@@ -226,6 +217,16 @@
     }
   }
 
+  if (options.linear_solver_type == CGNR &&
+      options.preconditioner_type == SUBSET &&
+      options.residual_blocks_for_subset_preconditioner.size() == 0) {
+    *error =
+        "When using SUBSET preconditioner, "
+        "Solver::Options::residual_blocks_for_subset_preconditioner cannot be "
+        "empty";
+    return false;
+  }
+
   return true;
 }
 
diff --git a/internal/ceres/subset_preconditioner.cc b/internal/ceres/subset_preconditioner.cc
index 865c5f1..7c24ae9 100644
--- a/internal/ceres/subset_preconditioner.cc
+++ b/internal/ceres/subset_preconditioner.cc
@@ -44,7 +44,9 @@
 SubsetPreconditioner::SubsetPreconditioner(
     const Preconditioner::Options& options, const BlockSparseMatrix& A)
     : options_(options), num_cols_(A.num_cols()) {
-  CHECK_GE(options_.subset_preconditioner_start_row_block, 0);
+  CHECK_GE(options_.subset_preconditioner_start_row_block, 0)
+      << "Congratulations, you found a bug in Ceres. Please report it.";
+
   LinearSolver::Options sparse_cholesky_options;
   sparse_cholesky_options.sparse_linear_algebra_library_type =
       options_.sparse_linear_algebra_library_type;
diff --git a/internal/ceres/subset_preconditioner.h b/internal/ceres/subset_preconditioner.h
index 77c3d91..afd3704 100644
--- a/internal/ceres/subset_preconditioner.h
+++ b/internal/ceres/subset_preconditioner.h
@@ -58,7 +58,7 @@
 // A = [P]
 //     [Q]
 //
-// where P as subset_preconditioner_start_row_block row blocks,
+// where P has subset_preconditioner_start_row_block row blocks,
 // and the preconditioner is the inverse of the matrix Q'Q.
 //
 // Obviously, the smaller this number, the more accurate and
diff --git a/internal/ceres/trust_region_preprocessor.cc b/internal/ceres/trust_region_preprocessor.cc
index dbeb3e5..498752b 100644
--- a/internal/ceres/trust_region_preprocessor.cc
+++ b/internal/ceres/trust_region_preprocessor.cc
@@ -108,8 +108,7 @@
   VLOG_IF(1, options->logging_type != SILENT) << message;
 }
 
-// For Schur type and SPARSE_NORMAL_CHOLESKY linear solvers, reorder
-// the program to reduce fill-in and increase cache coherency.
+// Reorder the program to reduce fill-in and increase cache coherency.
 bool ReorderProgram(PreprocessedProblem* pp) {
   const Solver::Options& options = pp->options;
   if (IsSchurType(options.linear_solver_type)) {
@@ -122,12 +121,27 @@
         &pp->error);
   }
 
-
   if (options.linear_solver_type == SPARSE_NORMAL_CHOLESKY &&
       !options.dynamic_sparsity) {
-    return ReorderProgramForSparseNormalCholesky(
+    return ReorderProgramForSparseCholesky(
         options.sparse_linear_algebra_library_type,
         *options.linear_solver_ordering,
+        0, /* use all the rows of the jacobian */
+        pp->reduced_program.get(),
+        &pp->error);
+  }
+
+  if (options.linear_solver_type == CGNR &&
+      options.preconditioner_type == SUBSET) {
+    pp->linear_solver_options.subset_preconditioner_start_row_block =
+        ReorderResidualBlocksByPartition(
+            options.residual_blocks_for_subset_preconditioner,
+            pp->reduced_program.get());
+
+    return ReorderProgramForSparseCholesky(
+        options.sparse_linear_algebra_library_type,
+        *options.linear_solver_ordering,
+        pp->linear_solver_options.subset_preconditioner_start_row_block,
         pp->reduced_program.get(),
         &pp->error);
   }
@@ -141,7 +155,9 @@
 // too.
 bool SetupLinearSolver(PreprocessedProblem* pp) {
   Solver::Options& options = pp->options;
-  if (options.linear_solver_ordering.get() == NULL) {
+  pp->linear_solver_options = LinearSolver::Options();
+
+  if (!options.linear_solver_ordering) {
     // If the user has not supplied a linear solver ordering, then we
     // assume that they are giving all the freedom to us in choosing
     // the best possible ordering. This intent can be indicated by
@@ -177,7 +193,6 @@
   }
 
   // Configure the linear solver.
-  pp->linear_solver_options = LinearSolver::Options();
   pp->linear_solver_options.min_num_iterations =
       options.min_linear_solver_iterations;
   pp->linear_solver_options.max_num_iterations =
@@ -236,7 +251,7 @@
   }
 
   pp->linear_solver.reset(LinearSolver::Create(pp->linear_solver_options));
-  return (pp->linear_solver.get() != NULL);
+  return (pp->linear_solver.get() != nullptr);
 }
 
 // Configure and create the evaluator.
@@ -262,7 +277,7 @@
                                         pp->reduced_program.get(),
                                         &pp->error));
 
-  return (pp->evaluator.get() != NULL);
+  return (pp->evaluator.get() != nullptr);
 }
 
 // If the user requested inner iterations, then find an inner
@@ -288,7 +303,7 @@
     return true;
   }
 
-  if (options.inner_iteration_ordering.get() != NULL) {
+  if (options.inner_iteration_ordering.get() != nullptr) {
     // If the user supplied an ordering, then remove the set of
     // inactive parameter blocks from it
     options.inner_iteration_ordering->Remove(pp->removed_parameter_blocks);
diff --git a/internal/ceres/types.cc b/internal/ceres/types.cc
index 4691414..93c4cfc 100644
--- a/internal/ceres/types.cc
+++ b/internal/ceres/types.cc
@@ -78,6 +78,7 @@
     CASESTR(SCHUR_JACOBI);
     CASESTR(CLUSTER_JACOBI);
     CASESTR(CLUSTER_TRIDIAGONAL);
+    CASESTR(SUBSET);
     default:
       return "UNKNOWN";
   }
@@ -90,6 +91,7 @@
   STRENUM(SCHUR_JACOBI);
   STRENUM(CLUSTER_JACOBI);
   STRENUM(CLUSTER_TRIDIAGONAL);
+  STRENUM(SUBSET);
   return false;
 }