CUDA CGNR, Part 4: CudaCgnrSolver

* Added CudaCgnrSolver, a new CUDA-accelerated CGNR.
* To use CudaCgnrSolver, the user must select CGNR as the linear_solver
  and CUDA_SPARSE as the sparse_linear_algebra_library.
* Updated ConjugateGradientSolver to work with an array of pointers to
  scratch to support CudaVectors as scratch.
* Moved CUDA initialization to run in Solver::Solve as needed.

Some performance comparisons on an Ubuntu 20.04 desktop with an
Intel i9-9940X CPU @ 3.30GHz, and an nVidia Quadro RTX 6000,
all configurations run with 24 threads, and 10 iterations.

=================================================
CGNR + CUDA_SPARSE + IDENTITY Preconditioner
problem-1778-993923-pre.txt
=================================================
Cost:
Initial                          2.563973e+08
Final                            1.724755e+06
Change                           2.546725e+08

Minimizer iterations                       11
Successful steps                            7
Unsuccessful steps                          4

Time (in seconds):
Preprocessor                         4.020158

  Residual only evaluation           1.567092 (10)
  Jacobian & residual evaluation     7.847130 (7)
  Linear solver                     31.688898 (10)
Minimizer                           46.834987

Postprocessor                        0.353974
Total                               51.209120

=================================================
SPARSE_SCHUR (CPU) + SUITE_SPARSE + AMD
problem-1778-993923-pre.txt
=================================================
Cost:
Initial                          2.563973e+08
Final                            1.651617e+06
Change                           2.547457e+08

Minimizer iterations                       11
Successful steps                           11
Unsuccessful steps                          0

Time (in seconds):
Preprocessor                        35.812003

  Residual only evaluation           1.658980 (10)
  Jacobian & residual evaluation    12.218799 (11)
  Linear solver                     76.409992 (10)
Minimizer                           98.809773

Postprocessor                        0.372712
Total                              134.994489

=================================================
ITERATIVE_SCHUR (CPU) + JACOBI Preconditioner
problem-1778-993923-pre.txt
=================================================
Cost:
Initial                          2.563973e+08
Final                            1.684447e+06
Change                           2.547128e+08

Minimizer iterations                       11
Successful steps                            8
Unsuccessful steps                          3

Time (in seconds):
Preprocessor                        15.331614

  Residual only evaluation           1.606114 (10)
  Jacobian & residual evaluation     8.502166 (8)
  Linear solver                    351.910080 (10)
Minimizer                          368.797327

Postprocessor                        0.363536
Total                              384.492478

=================================================
CGNR + CUDA_SPARSE + IDENTITY Preconditioner
problem-13682-4456117-pre.txt
=================================================
Cost:
Initial                          1.126372e+09
Final                            2.269329e+07
Change                           1.103678e+09

Minimizer iterations                       11
Successful steps                            7
Unsuccessful steps                          4

Time (in seconds):
Preprocessor                        19.140087

  Residual only evaluation           8.721920 (10)
  Jacobian & residual evaluation    41.955923 (7)
  Linear solver                    214.121861 (10)
Minimizer                          296.636890

Postprocessor                        1.971827
Total                              317.748804

Change-Id: I3a09f31aa6903f661e91f595afd39d427583e856
diff --git a/examples/bundle_adjuster.cc b/examples/bundle_adjuster.cc
index 27ea80e..b05c3a2 100644
--- a/examples/bundle_adjuster.cc
+++ b/examples/bundle_adjuster.cc
@@ -81,8 +81,9 @@
               "automatic, cameras, points, cameras,points, points,cameras");
 
 DEFINE_string(linear_solver, "sparse_schur", "Options are: "
-              "sparse_schur, dense_schur, iterative_schur, sparse_normal_cholesky, "
-              "dense_qr, dense_normal_cholesky and cgnr.");
+              "sparse_schur, dense_schur, iterative_schur, "
+              "sparse_normal_cholesky, dense_qr, dense_normal_cholesky, "
+              "and cgnr.");
 DEFINE_bool(explicit_schur_complement, false, "If using ITERATIVE_SCHUR "
             "then explicitly compute the Schur complement.");
 DEFINE_string(preconditioner, "jacobi", "Options are: "
@@ -94,11 +95,13 @@
             "Use power series expansion to initialize the solution in ITERATIVE_SCHUR linear solver.");
 
 DEFINE_string(sparse_linear_algebra_library, "suite_sparse",
-              "Options are: suite_sparse, cx_sparse, accelerate_sparse and eigen_sparse.");
+              "Options are: suite_sparse, accelerate_sparse, eigen_sparse, and "
+              "cuda_sparse.");
 DEFINE_string(dense_linear_algebra_library, "eigen",
               "Options are: eigen, lapack, and cuda");
 DEFINE_string(ordering_type, "amd", "Options are: amd, nesdis");
-DEFINE_string(linear_solver_ordering, "automatic", "Options are: automatic and user");
+DEFINE_string(linear_solver_ordering, "automatic",
+              "Options are: automatic and user");
 
 DEFINE_bool(use_quaternions, false, "If true, uses quaternions to represent "
             "rotations. If false, angle axis is used.");
@@ -351,6 +354,7 @@
   SetSolverOptionsFromFlags(&bal_problem, &options);
   options.gradient_tolerance = 1e-16;
   options.function_tolerance = 1e-16;
+  options.parameter_tolerance = 1e-16;
   Solver::Summary summary;
   Solve(options, &problem, &summary);
   std::cout << summary.FullReport() << "\n";
diff --git a/include/ceres/types.h b/include/ceres/types.h
index 1b07ab3..0a110e2 100644
--- a/include/ceres/types.h
+++ b/include/ceres/types.h
@@ -137,7 +137,7 @@
   // well the matrix Q approximates J'J, or how well the chosen
   // residual blocks approximate the non-linear least squares
   // problem.
-  SUBSET,
+  SUBSET
 };
 
 enum VisibilityClusteringType {
@@ -175,6 +175,9 @@
   // Apple's Accelerate framework sparse linear algebra routines.
   ACCELERATE_SPARSE,
 
+  // Nvidia's cuSPARSE library.
+  CUDA_SPARSE,
+
   // No sparse linear solver should be used.  This does not necessarily
   // imply that Ceres was built without any sparse library, although that
   // is the likely use case, merely that one should not be used.
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index 99d5375..f4f9e0f 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -35,12 +35,15 @@
 
 #include "ceres/block_jacobi_preconditioner.h"
 #include "ceres/conjugate_gradients_solver.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_vector.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/linear_solver.h"
 #include "ceres/subset_preconditioner.h"
 #include "ceres/wall_time.h"
 #include "glog/logging.h"
 
+
 namespace ceres::internal {
 
 // A linear operator which takes a matrix A and a diagonal vector D and
@@ -117,7 +120,14 @@
   }
 }
 
-CgnrSolver::~CgnrSolver() = default;
+CgnrSolver::~CgnrSolver()  {
+  for (int i = 0; i < 4; ++i) {
+    if (scratch_[i]) {
+      delete scratch_[i];
+      scratch_[i] = nullptr;
+    }
+  }
+}
 
 LinearSolver::Summary CgnrSolver::SolveImpl(
     BlockSparseMatrix* A,
@@ -163,10 +173,13 @@
 
   cg_solution_ = Vector::Zero(A->num_cols());
   for (int i = 0; i < 4; ++i) {
-    scratch_[i] = Vector::Zero(A->num_cols());
+    if (scratch_[i] == nullptr) {
+      scratch_[i] = new Vector(A->num_cols());
+    }
   }
   event_logger.AddEvent("Setup");
 
+
   LinearOperatorAdapter preconditioner(*preconditioner_);
   auto summary = ConjugateGradientsSolver(
       cg_options, lhs, rhs, preconditioner, scratch_, cg_solution_);
@@ -175,4 +188,135 @@
   return summary;
 }
 
+#ifndef CERES_NO_CUDA
+
+// A linear operator which takes a matrix A and a diagonal vector D and
+// performs products of the form
+//
+//   (A^T A + D^T D)x
+//
+// This is used to implement iterative general sparse linear solving with
+// conjugate gradients, where A is the Jacobian and D is a regularizing
+// parameter. A brief proof is included in cgnr_linear_operator.h.
+class CERES_NO_EXPORT CudaCgnrLinearOperator final : public
+    ConjugateGradientsLinearOperator<CudaVector> {
+ public:
+  CudaCgnrLinearOperator(CudaSparseMatrix& A,
+                         const CudaVector& D,
+                         CudaVector* z) : A_(A), D_(D), z_(z) {}
+
+  void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
+    // z = Ax
+    z_->SetZero();
+    A_.RightMultiplyAndAccumulate(x, z_);
+
+    // y = y + Atz
+    //   = y + AtAx
+    A_.LeftMultiplyAndAccumulate(*z_, &y);
+
+    // y = y + DtDx
+    y.DtDxpy(D_, x);
+  }
+
+ private:
+  CudaSparseMatrix& A_;
+  const CudaVector& D_;
+  CudaVector* z_ = nullptr;
+};
+
+class CERES_NO_EXPORT CudaIdentityPreconditioner final : public
+    ConjugateGradientsLinearOperator<CudaVector> {
+ public:
+  void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
+    y.Axpby(1.0, x, 1.0);
+  }
+};
+
+CudaCgnrSolver::CudaCgnrSolver(LinearSolver::Options options) :
+    options_(std::move(options)) {}
+
+CudaCgnrSolver::~CudaCgnrSolver() {
+  for (int i = 0; i < 4; ++i) {
+    if (scratch_[i]) {
+      delete scratch_[i];
+      scratch_[i] = nullptr;
+    }
+  }
+}
+
+std::unique_ptr<CudaCgnrSolver> CudaCgnrSolver::Create(
+      LinearSolver::Options options, std::string* error) {
+  CHECK(error != nullptr);
+  if (options.preconditioner_type != IDENTITY) {
+    *error = "CudaCgnrSolver does not support preconditioner type " +
+        std::string(PreconditionerTypeToString(options.preconditioner_type)) + ". ";
+    return nullptr;
+  }
+  CHECK(options.context->IsCUDAInitialized())
+      << "CudaCgnrSolver requires CUDA initialization.";
+  auto solver = std::make_unique<CudaCgnrSolver>(options);
+  return solver;
+}
+
+void CudaCgnrSolver::CpuToGpuTransfer(
+    const CompressedRowSparseMatrix& A, const double* b, const double* D) {
+  if (A_ == nullptr) {
+    // Assume structure is not cached, do an initialization and structural copy.
+    A_ = std::make_unique<CudaSparseMatrix>(options_.context, A);
+    b_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
+    x_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
+    Atb_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
+    Ax_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
+    D_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
+    for (int i = 0; i < 4; ++i) {
+      scratch_[i] = new CudaVector(options_.context, A.num_cols());
+    }
+  } else {
+    // Assume structure is cached, do a value copy.
+    A_->CopyValuesFromCpu(A);
+  }
+  b_->CopyFromCpu(ConstVectorRef(b, A.num_rows()));
+  D_->CopyFromCpu(ConstVectorRef(D, A.num_cols()));
+}
+
+LinearSolver::Summary CudaCgnrSolver::SolveImpl(
+    CompressedRowSparseMatrix* A,
+    const double* b,
+    const LinearSolver::PerSolveOptions& per_solve_options,
+    double* x) {
+  EventLogger event_logger("CudaCgnrSolver::Solve");
+  LinearSolver::Summary summary;
+  summary.num_iterations = 0;
+  summary.termination_type = LinearSolverTerminationType::FATAL_ERROR;
+
+  CpuToGpuTransfer(*A, b, per_solve_options.D);
+  event_logger.AddEvent("CPU to GPU Transfer");
+
+  // Form z = Atb.
+  Atb_->SetZero();
+  A_->LeftMultiplyAndAccumulate(*b_, Atb_.get());
+
+  // Solve (AtA + DtD)x = z (= Atb).
+  x_->SetZero();
+  CudaCgnrLinearOperator lhs(*A_, *D_, Ax_.get());
+
+  event_logger.AddEvent("Setup");
+
+  ConjugateGradientsSolverOptions cg_options;
+  cg_options.min_num_iterations = options_.min_num_iterations;
+  cg_options.max_num_iterations = options_.max_num_iterations;
+  cg_options.residual_reset_period = options_.residual_reset_period;
+  cg_options.q_tolerance = per_solve_options.q_tolerance;
+  cg_options.r_tolerance = per_solve_options.r_tolerance;
+
+  CudaIdentityPreconditioner preconditioner;
+  summary = ConjugateGradientsSolver(
+      cg_options, lhs, *Atb_, preconditioner, scratch_, *x_);
+  x_->CopyTo(x);
+  event_logger.AddEvent("Solve");
+  return summary;
+}
+
+#endif  // CERES_NO_CUDA
+
 }  // namespace ceres::internal
diff --git a/internal/ceres/cgnr_solver.h b/internal/ceres/cgnr_solver.h
index 6982296..a971c81 100644
--- a/internal/ceres/cgnr_solver.h
+++ b/internal/ceres/cgnr_solver.h
@@ -33,6 +33,8 @@
 
 #include <memory>
 
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_vector.h"
 #include "ceres/internal/export.h"
 #include "ceres/linear_solver.h"
 
@@ -65,9 +67,40 @@
   const LinearSolver::Options options_;
   std::unique_ptr<Preconditioner> preconditioner_;
   Vector cg_solution_;
-  Vector scratch_[4];
+  Vector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr};
 };
 
+#ifndef CERES_NO_CUDA
+// A Cuda-accelerated version of CgnrSolver.
+// This solver assumes that the sparsity structure of A remains constant for its
+// lifetime.
+class CERES_NO_EXPORT CudaCgnrSolver final : public CompressedRowSparseMatrixSolver {
+ public:
+  explicit CudaCgnrSolver(LinearSolver::Options options);
+  static std::unique_ptr<CudaCgnrSolver> Create(
+      LinearSolver::Options options, std::string* error);
+  ~CudaCgnrSolver() override;
+
+  Summary SolveImpl(CompressedRowSparseMatrix* A,
+                    const double* b,
+                    const LinearSolver::PerSolveOptions& per_solve_options,
+                    double* x) final;
+
+ private:
+  void CpuToGpuTransfer(
+      const CompressedRowSparseMatrix& A, const double* b, const double* D);
+
+  LinearSolver::Options options_;
+  std::unique_ptr<CudaSparseMatrix> A_;
+  std::unique_ptr<CudaVector> b_;
+  std::unique_ptr<CudaVector> x_;
+  std::unique_ptr<CudaVector> Atb_;
+  std::unique_ptr<CudaVector> Ax_;
+  std::unique_ptr<CudaVector> D_;
+  CudaVector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr};
+};
+#endif  // CERES_NO_CUDA
+
 }  // namespace ceres::internal
 
 #endif  // CERES_INTERNAL_CGNR_SOLVER_H_
diff --git a/internal/ceres/conjugate_gradients_solver.h b/internal/ceres/conjugate_gradients_solver.h
index f2eea87..0c3581b 100644
--- a/internal/ceres/conjugate_gradients_solver.h
+++ b/internal/ceres/conjugate_gradients_solver.h
@@ -100,25 +100,25 @@
 // allows us to have a single implementation that works on CPU and GPU based
 // matrices and vectors.
 //
-// scratch must contain four DenseVector objects of the same size as rhs and
-// solution. By asking the user for scratch space, we guarantee that we will not
-// perform any allocations inside this function.
+// scratch must contain pointers to four DenseVector objects of the same size as
+// rhs and solution. By asking the user for scratch space, we guarantee that we
+// will not perform any allocations inside this function.
 template <typename DenseVectorType>
 LinearSolver::Summary ConjugateGradientsSolver(
     const ConjugateGradientsSolverOptions options,
     ConjugateGradientsLinearOperator<DenseVectorType>& lhs,
     const DenseVectorType& rhs,
     ConjugateGradientsLinearOperator<DenseVectorType>& preconditioner,
-    DenseVectorType scratch[4],
+    DenseVectorType* scratch[4],
     DenseVectorType& solution) {
   auto IsZeroOrInfinity = [](double x) {
     return ((x == 0.0) || std::isinf(x));
   };
 
-  DenseVectorType& p = scratch[0];
-  DenseVectorType& r = scratch[1];
-  DenseVectorType& z = scratch[2];
-  DenseVectorType& tmp = scratch[3];
+  DenseVectorType& p = *scratch[0];
+  DenseVectorType& r = *scratch[1];
+  DenseVectorType& z = *scratch[2];
+  DenseVectorType& tmp = *scratch[3];
 
   LinearSolver::Summary summary;
   summary.termination_type = LinearSolverTerminationType::NO_CONVERGENCE;
@@ -160,7 +160,7 @@
     SetZero(z);
     preconditioner.RightMultiplyAndAccumulate(r, z);
 
-    double last_rho = rho;
+    const double last_rho = rho;
     // rho = r.dot(z);
     rho = Dot(r, z);
     if (IsZeroOrInfinity(rho)) {
@@ -172,7 +172,7 @@
     if (summary.num_iterations == 1) {
       Copy(z, p);
     } else {
-      double beta = rho / last_rho;
+      const double beta = rho / last_rho;
       if (IsZeroOrInfinity(beta)) {
         summary.termination_type = LinearSolverTerminationType::FAILURE;
         summary.message = StringPrintf(
diff --git a/internal/ceres/conjugate_gradients_solver_test.cc b/internal/ceres/conjugate_gradients_solver_test.cc
index a01dfc9..e59eba7 100644
--- a/internal/ceres/conjugate_gradients_solver_test.cc
+++ b/internal/ceres/conjugate_gradients_solver_test.cc
@@ -74,9 +74,10 @@
   IdentityPreconditioner identity(A->num_cols());
   LinearOperatorAdapter lhs(*A);
   LinearOperatorAdapter preconditioner(identity);
-
-  auto summary =
-      ConjugateGradientsSolver(cg_options, lhs, b, preconditioner, scratch, x);
+  Vector* scratch_array[4] = 
+      {&scratch[0], &scratch[1], &scratch[2], &scratch[3]};
+  auto summary = ConjugateGradientsSolver(
+      cg_options, lhs, b, preconditioner, scratch_array, x);
 
   EXPECT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
   ASSERT_EQ(summary.num_iterations, 1);
@@ -135,13 +136,14 @@
   for (int i = 0; i < 4; ++i) {
     scratch[i] = Vector::Zero(A->num_cols());
   }
-
+  Vector* scratch_array[4] = 
+      {&scratch[0], &scratch[1], &scratch[2], &scratch[3]};
   IdentityPreconditioner identity(A->num_cols());
   LinearOperatorAdapter lhs(*A);
   LinearOperatorAdapter preconditioner(identity);
 
-  auto summary =
-      ConjugateGradientsSolver(cg_options, lhs, b, preconditioner, scratch, x);
+  auto summary = ConjugateGradientsSolver(
+      cg_options, lhs, b, preconditioner, scratch_array, x);
 
   EXPECT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
 
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
index c7b11ce..1483923 100644
--- a/internal/ceres/cuda_dense_cholesky_test.cc
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -36,8 +36,7 @@
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 
-namespace ceres {
-namespace internal {
+namespace ceres::internal {
 
 #ifndef CERES_NO_CUDA
 
@@ -45,6 +44,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   auto dense_cuda_solver = CUDADenseCholesky::Create(options);
   EXPECT_EQ(dense_cuda_solver, nullptr);
 }
@@ -63,6 +64,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = CUDA;
   auto dense_cuda_solver = CUDADenseCholesky::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
@@ -90,6 +93,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = CUDA;
   auto dense_cuda_solver = CUDADenseCholesky::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
@@ -110,6 +115,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = CUDA;
   auto dense_cuda_solver = CUDADenseCholesky::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
@@ -123,6 +130,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = CUDA;
   auto dense_cuda_solver = CUDADenseCholesky::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
@@ -140,6 +149,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = ceres::CUDA;
   std::unique_ptr<DenseCholesky> dense_cholesky =
       CUDADenseCholesky::Create(options);
@@ -175,12 +186,20 @@
   {
     // Did not ask for CUDA, and did not ask for mixed precision.
     LinearSolver::Options options;
+    ContextImpl context;
+    options.context = &context;
+    std::string error;
+    EXPECT_TRUE(context.InitCUDA(&error)) << error;
     auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
     ASSERT_EQ(solver, nullptr);
   }
   {
     // Asked for CUDA, but did not ask for mixed precision.
     LinearSolver::Options options;
+    ContextImpl context;
+    options.context = &context;
+    std::string error;
+    EXPECT_TRUE(context.InitCUDA(&error)) << error;
     options.dense_linear_algebra_library_type = ceres::CUDA;
     auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
     ASSERT_EQ(solver, nullptr);
@@ -204,6 +223,8 @@
   options.max_num_refinement_iterations = 0;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = CUDA;
   options.use_mixed_precision_solves = true;
   auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
@@ -239,6 +260,8 @@
   options.max_num_refinement_iterations = 3;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = CUDA;
   options.use_mixed_precision_solves = true;
   auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
@@ -267,6 +290,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = ceres::CUDA;
   options.use_mixed_precision_solves = true;
   options.max_num_refinement_iterations = 20;
@@ -302,5 +327,4 @@
 
 #endif  // CERES_NO_CUDA
 
-}  // namespace internal
-}  // namespace ceres
+}  // namespace ceres::internal
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc
index 9eb5d4d..cc0dbd6 100644
--- a/internal/ceres/cuda_dense_qr_test.cc
+++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -41,6 +41,10 @@
 
 TEST(CUDADenseQR, InvalidOptionOnCreate) {
   LinearSolver::Options options;
+  ContextImpl context;
+  options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   auto dense_cuda_solver = CUDADenseQR::Create(options);
   EXPECT_EQ(dense_cuda_solver, nullptr);
 }
@@ -58,6 +62,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = CUDA;
   auto dense_cuda_solver = CUDADenseQR::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
@@ -90,6 +96,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = CUDA;
   auto dense_cuda_solver = CUDADenseQR::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
@@ -112,6 +120,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = CUDA;
   auto dense_cuda_solver = CUDADenseQR::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
@@ -130,10 +140,12 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  EXPECT_TRUE(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = ceres::CUDA;
   std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options);
 
-  const int kNumTrials = 100;
+  const int kNumTrials = 20;
   for (int i = 0; i < kNumTrials; ++i) {
     LhsType lhs = LhsType::Random(kNumRows, kNumCols);
     SolutionType x_expected = SolutionType::Random(kNumCols);
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index 244de74..af42e73 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -348,9 +348,8 @@
 #ifndef CERES_NO_CUDA
 
 bool CUDADenseCholesky::Init(ContextImpl* context, std::string* message) {
-  if (!context->InitCUDA(message)) {
-    return false;
-  }
+  CHECK(context->IsCUDAInitialized())
+      << "CUDADenseCholesky requires CUDA initialization.";
   cusolver_handle_ = context->cusolver_handle_;
   stream_ = context->stream_;
   error_.Reserve(1);
@@ -491,9 +490,8 @@
 
 bool CUDADenseCholeskyMixedPrecision::Init(const LinearSolver::Options& options,
                                            std::string* message) {
-  if (!options.context->InitCUDA(message)) {
-    return false;
-  }
+  CHECK(options.context->IsCUDAInitialized())
+      << "CUDADenseCholeskyMixedPrecision requires CUDA initialization.";
   cusolver_handle_ = options.context->cusolver_handle_;
   cublas_handle_ = options.context->cublas_handle_;
   stream_ = options.context->stream_;
diff --git a/internal/ceres/dense_cholesky_test.cc b/internal/ceres/dense_cholesky_test.cc
index 1c0df56..34ff4f3 100644
--- a/internal/ceres/dense_cholesky_test.cc
+++ b/internal/ceres/dense_cholesky_test.cc
@@ -76,6 +76,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  CHECK(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = ::testing::get<0>(GetParam());
   options.use_mixed_precision_solves = ::testing::get<1>(GetParam());
   const int kNumRefinementSteps = 4;
diff --git a/internal/ceres/dense_qr_test.cc b/internal/ceres/dense_qr_test.cc
index acef750..f562d97 100644
--- a/internal/ceres/dense_qr_test.cc
+++ b/internal/ceres/dense_qr_test.cc
@@ -68,6 +68,8 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
+  std::string error;
+  CHECK(context.InitCUDA(&error)) << error;
   options.dense_linear_algebra_library_type = GetParam();
   const double kEpsilon = std::numeric_limits<double>::epsilon() * 1.5e4;
   std::unique_ptr<DenseQR> dense_qr = DenseQR::Create(options);
diff --git a/internal/ceres/evaluator.cc b/internal/ceres/evaluator.cc
index f44796a..91cf221 100644
--- a/internal/ceres/evaluator.cc
+++ b/internal/ceres/evaluator.cc
@@ -64,10 +64,17 @@
     case DENSE_SCHUR:
     case SPARSE_SCHUR:
     case ITERATIVE_SCHUR:
-    case CGNR:
-      return std::make_unique<
-          ProgramEvaluator<BlockEvaluatePreparer, BlockJacobianWriter>>(
-          options, program);
+    case CGNR: {
+      if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
+        return std::make_unique<
+          ProgramEvaluator<ScratchEvaluatePreparer,
+                           CompressedRowJacobianWriter>>(options, program);
+      } else {
+        return std::make_unique<
+            ProgramEvaluator<BlockEvaluatePreparer, BlockJacobianWriter>>(
+            options, program);
+      }
+    }
     case SPARSE_NORMAL_CHOLESKY:
       if (options.dynamic_sparsity) {
         return std::make_unique<
diff --git a/internal/ceres/evaluator.h b/internal/ceres/evaluator.h
index 68a4fb2..7ea523c 100644
--- a/internal/ceres/evaluator.h
+++ b/internal/ceres/evaluator.h
@@ -65,6 +65,7 @@
     int num_threads = 1;
     int num_eliminate_blocks = -1;
     LinearSolverType linear_solver_type = DENSE_QR;
+    SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type = NO_SPARSE;
     bool dynamic_sparsity = false;
     ContextImpl* context = nullptr;
     EvaluationCallback* evaluation_callback = nullptr;
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc
index 607502e..0463afb 100644
--- a/internal/ceres/iterative_schur_complement_solver.cc
+++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -129,6 +129,9 @@
   for (int i = 0; i < 4; ++i) {
     scratch[i] = Vector::Zero(schur_complement_->num_cols());
   }
+  Vector* scratch_ptr[4] = {
+      &scratch[0], &scratch[1], &scratch[2], &scratch[3]
+  };
 
   event_logger.AddEvent("Setup");
 
@@ -137,7 +140,7 @@
                                lhs,
                                schur_complement_->rhs(),
                                preconditioner,
-                               scratch,
+                               scratch_ptr,
                                reduced_linear_system_solution_);
 
   if (summary.termination_type != LinearSolverTerminationType::FAILURE &&
diff --git a/internal/ceres/linear_solver.cc b/internal/ceres/linear_solver.cc
index d9ef150..2e3fe4b 100644
--- a/internal/ceres/linear_solver.cc
+++ b/internal/ceres/linear_solver.cc
@@ -76,8 +76,15 @@
   CHECK(options.context != nullptr);
 
   switch (options.type) {
-    case CGNR:
+    case CGNR: {
+#ifndef CERES_NO_CUDA
+      if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
+        std::string error;
+        return CudaCgnrSolver::Create(options, &error);
+      }
+#endif
       return std::make_unique<CgnrSolver>(options);
+    } break;
 
     case SPARSE_NORMAL_CHOLESKY:
 #if defined(CERES_NO_SPARSE)
diff --git a/internal/ceres/schur_complement_solver.cc b/internal/ceres/schur_complement_solver.cc
index ba96e1c..631862c 100644
--- a/internal/ceres/schur_complement_solver.cc
+++ b/internal/ceres/schur_complement_solver.cc
@@ -217,7 +217,14 @@
   }
 }
 
-SparseSchurComplementSolver::~SparseSchurComplementSolver() = default;
+SparseSchurComplementSolver::~SparseSchurComplementSolver() {
+  for (int i = 0; i < 4; ++i) {
+    if (scratch_[i]) {
+      delete scratch_[i];
+      scratch_[i] = nullptr;
+    }
+  }
+}
 
 // Determine the non-zero blocks in the Schur Complement matrix, and
 // initialize a BlockRandomAccessSparseMatrix object.
@@ -394,11 +401,11 @@
   cg_options.r_tolerance = per_solve_options.r_tolerance;
 
   cg_solution_ = Vector::Zero(sc->num_rows());
-  Vector scratch[4];
   for (int i = 0; i < 4; ++i) {
-    scratch_[i] = Vector::Zero(sc->num_rows());
+    if (scratch_[i] == nullptr) {
+      scratch_[i] = new Vector(sc->num_rows());
+    }
   }
-
   auto summary = ConjugateGradientsSolver<Vector>(
       cg_options, *lhs, rhs(), *preconditioner, scratch_, cg_solution_);
   VectorRef(solution, sc->num_rows()) = cg_solution_;
diff --git a/internal/ceres/schur_complement_solver.h b/internal/ceres/schur_complement_solver.h
index 4fde69c..cac96b5 100644
--- a/internal/ceres/schur_complement_solver.h
+++ b/internal/ceres/schur_complement_solver.h
@@ -188,7 +188,7 @@
   std::unique_ptr<SparseCholesky> sparse_cholesky_;
   std::unique_ptr<BlockRandomAccessDiagonalMatrix> preconditioner_;
   Vector cg_solution_;
-  Vector scratch_[4];
+  Vector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr};
 };
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/solver.cc b/internal/ceres/solver.cc
index b772f3f..0e0790e 100644
--- a/internal/ceres/solver.cc
+++ b/internal/ceres/solver.cc
@@ -327,7 +327,8 @@
     return false;
   }
 
-  if (options.preconditioner_type == SUBSET) {
+  if (options.sparse_linear_algebra_library_type != CUDA_SPARSE &&
+      options.preconditioner_type == SUBSET) {
     if (options.residual_blocks_for_subset_preconditioner.empty()) {
       *error =
           "When using SUBSET preconditioner, "
@@ -341,6 +342,20 @@
     }
   }
 
+  // Check options for CGNR with CUDA_SPARSE.
+  if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
+    if (!IsSparseLinearAlgebraLibraryTypeAvailable(CUDA_SPARSE)) {
+      *error = "Can't use CGNR with sparse_linear_algebra_library_type = "
+          "CUDA_SPARSE because support was not enabled when Ceres was built.";
+      return false;
+    }
+    if (options.preconditioner_type != IDENTITY) {
+      StringPrintf("Can't use CGNR with preconditioner_type = %s when "
+                   "sparse_linear_algebra_library_type = CUDA_SPARSE.",
+                    PreconditionerTypeToString(options.preconditioner_type));
+      return false;
+    }
+  }
   return true;
 }
 
@@ -652,6 +667,18 @@
   return internal::StringPrintf("%s,%s,%s", row.c_str(), e.c_str(), f.c_str());
 }
 
+bool IsCudaRequired(const Solver::Options& options) {
+  if (options.linear_solver_type == DENSE_NORMAL_CHOLESKY ||
+      options.linear_solver_type == DENSE_SCHUR ||
+      options.linear_solver_type == DENSE_QR) {
+    return (options.dense_linear_algebra_library_type == CUDA);
+  }
+  if (options.linear_solver_type == CGNR) {
+    return (options.sparse_linear_algebra_library_type == CUDA_SPARSE);
+  }
+  return false;
+}
+
 }  // namespace
 
 bool Solver::Options::IsValid(string* error) const {
@@ -697,6 +724,13 @@
   Program* program = problem_impl->mutable_program();
   PreSolveSummarize(options, problem_impl, summary);
 
+  if (IsCudaRequired(options)) {
+    if (!problem_impl->context()->InitCUDA(&summary->message)) {
+      LOG(ERROR) << "Terminating: " << summary->message;
+      return;
+    }
+  }
+
   // If gradient_checking is enabled, wrap all cost functions in a
   // gradient checker and install a callback that terminates if any gradient
   // error is detected.
@@ -866,22 +900,40 @@
       }
     }
 
-    if (linear_solver_type_used == SPARSE_NORMAL_CHOLESKY ||
+    const bool used_sparse_linear_algebra_library =
+        linear_solver_type_used == SPARSE_NORMAL_CHOLESKY ||
         linear_solver_type_used == SPARSE_SCHUR ||
-        (linear_solver_type_used == CGNR &&
-         preconditioner_type_used == SUBSET) ||
+        linear_solver_type_used == CGNR ||
         (linear_solver_type_used == ITERATIVE_SCHUR &&
-         (preconditioner_type_used == CLUSTER_JACOBI ||
-          preconditioner_type_used == CLUSTER_TRIDIAGONAL))) {
+            (preconditioner_type_used == CLUSTER_JACOBI ||
+            preconditioner_type_used == CLUSTER_TRIDIAGONAL));
+
+    const bool linear_solver_ordering_required =
+        linear_solver_type_used == SPARSE_SCHUR ||
+        (linear_solver_type_used == ITERATIVE_SCHUR &&
+            (preconditioner_type_used == CLUSTER_JACOBI ||
+            preconditioner_type_used == CLUSTER_TRIDIAGONAL)) ||
+        (linear_solver_type_used == CGNR && preconditioner_type_used == SUBSET);
+
+    if (used_sparse_linear_algebra_library) {
       const char* mixed_precision_suffix =
           (mixed_precision_solves_used ? "(Mixed Precision)" : "");
-      StringAppendF(
-          &report,
-          "\nSparse linear algebra library %15s + %s %s\n",
-          SparseLinearAlgebraLibraryTypeToString(
-              sparse_linear_algebra_library_type),
-          LinearSolverOrderingTypeToString(linear_solver_ordering_type),
-          mixed_precision_suffix);
+      if (linear_solver_ordering_required) {
+        StringAppendF(
+            &report,
+            "\nSparse linear algebra library %15s + %s %s\n",
+            SparseLinearAlgebraLibraryTypeToString(
+                sparse_linear_algebra_library_type),
+            LinearSolverOrderingTypeToString(linear_solver_ordering_type),
+            mixed_precision_suffix);
+      } else {
+        StringAppendF(
+            &report,
+            "\nSparse linear algebra library %15s %s\n",
+            SparseLinearAlgebraLibraryTypeToString(
+                sparse_linear_algebra_library_type),
+            mixed_precision_suffix);
+      }
     }
 
     StringAppendF(&report, "\n");
diff --git a/internal/ceres/solver_test.cc b/internal/ceres/solver_test.cc
index b4b2d34..92c1ddc 100644
--- a/internal/ceres/solver_test.cc
+++ b/internal/ceres/solver_test.cc
@@ -878,6 +878,20 @@
   options.dynamic_sparsity = false;
   options.use_mixed_precision_solves = true;
   EXPECT_FALSE(options.IsValid(&message));
+
+  options.sparse_linear_algebra_library_type = CUDA_SPARSE;
+  options.dynamic_sparsity = false;
+  options.use_mixed_precision_solves = false;
+  EXPECT_EQ(options.IsValid(&message),
+      IsSparseLinearAlgebraLibraryTypeAvailable(CUDA_SPARSE));
+
+  options.dynamic_sparsity = true;
+  options.use_mixed_precision_solves = false;
+  EXPECT_FALSE(options.IsValid(&message));
+
+  options.dynamic_sparsity = false;
+  options.use_mixed_precision_solves = true;
+  EXPECT_FALSE(options.IsValid(&message));
 }
 
 TEST(Solver, CgnrOptionsJacobiPreconditioner) {
@@ -940,6 +954,20 @@
   options.dynamic_sparsity = false;
   options.use_mixed_precision_solves = true;
   EXPECT_FALSE(options.IsValid(&message));
+
+  options.sparse_linear_algebra_library_type = CUDA_SPARSE;
+
+  options.dynamic_sparsity = false;
+  options.use_mixed_precision_solves = false;
+  EXPECT_FALSE(options.IsValid(&message));
+
+  options.dynamic_sparsity = true;
+  options.use_mixed_precision_solves = false;
+  EXPECT_FALSE(options.IsValid(&message));
+
+  options.dynamic_sparsity = false;
+  options.use_mixed_precision_solves = true;
+  EXPECT_FALSE(options.IsValid(&message));
 }
 
 TEST(Solver, CgnrOptionsSubsetPreconditioner) {
@@ -1012,6 +1040,19 @@
     options.use_mixed_precision_solves = true;
     EXPECT_FALSE(options.IsValid(&message));
   }
+
+  options.sparse_linear_algebra_library_type = CUDA_SPARSE;
+  options.dynamic_sparsity = false;
+  options.use_mixed_precision_solves = false;
+  EXPECT_FALSE(options.IsValid(&message));
+
+  options.dynamic_sparsity = true;
+  options.use_mixed_precision_solves = false;
+  EXPECT_FALSE(options.IsValid(&message));
+
+  options.dynamic_sparsity = false;
+  options.use_mixed_precision_solves = true;
+  EXPECT_FALSE(options.IsValid(&message));
 }
 
 TEST(Solver, CgnrOptionsSchurPreconditioners) {
diff --git a/internal/ceres/trust_region_preprocessor.cc b/internal/ceres/trust_region_preprocessor.cc
index d27a472..84e783f 100644
--- a/internal/ceres/trust_region_preprocessor.cc
+++ b/internal/ceres/trust_region_preprocessor.cc
@@ -261,6 +261,8 @@
   const Solver::Options& options = pp->options;
   pp->evaluator_options = Evaluator::Options();
   pp->evaluator_options.linear_solver_type = options.linear_solver_type;
+  pp->evaluator_options.sparse_linear_algebra_library_type =
+      options.sparse_linear_algebra_library_type;
   pp->evaluator_options.num_eliminate_blocks = 0;
   if (IsSchurType(options.linear_solver_type)) {
     pp->evaluator_options.num_eliminate_blocks =
diff --git a/internal/ceres/types.cc b/internal/ceres/types.cc
index 1d514de..3de91ed 100644
--- a/internal/ceres/types.cc
+++ b/internal/ceres/types.cc
@@ -108,6 +108,7 @@
     CASESTR(SUITE_SPARSE);
     CASESTR(EIGEN_SPARSE);
     CASESTR(ACCELERATE_SPARSE);
+    CASESTR(CUDA_SPARSE);
     CASESTR(NO_SPARSE);
     default:
       return "UNKNOWN";
@@ -120,6 +121,7 @@
   STRENUM(SUITE_SPARSE);
   STRENUM(EIGEN_SPARSE);
   STRENUM(ACCELERATE_SPARSE);
+  STRENUM(CUDA_SPARSE);
   STRENUM(NO_SPARSE);
   return false;
 }
@@ -420,6 +422,14 @@
 #endif
   }
 
+  if (type == CUDA_SPARSE) {
+#ifdef CERES_NO_CUDA
+    return false;
+#else
+    return true;
+#endif
+  }
+
   if (type == NO_SPARSE) {
     return true;
   }