Add support for dense CUDA solvers #1

1. Add CUDADenseCholesky64Bit, CUDADenseCholesky32Bit, & tests.
   CUDADenseCholesky32Bit uses the legacy versions of potrf/potrs
   in cuSolverDN, while CUDADenseCholesky64Bit uses the new 64-bit
   versions available since Cuda 11.1. The legacy versions are
   provided since some platforms such as the Nvidia Jetsons only
   support Cuda 10.2.
2. Expose CUDA as a new option under DenseLinearAlgebraLibraryType.
   The relevant option to string and string to option helper functions
   are modified accordingly.
3. Add cuda as a dense_linear_algebra_library option in bundle_adjuster
   to demonstrate the use of the new CUDA option.

Change-Id: I23615e1d301df5185ed646b3e33ee802508dae86
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 68d99c9..5254b73 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -154,6 +154,7 @@
   option(ACCELERATESPARSE
     "Enable use of sparse solvers in Apple's Accelerate framework." ON)
 endif()
+option(CUDA "Enable use of CUDA linear algebra solvers." ON)
 option(LAPACK "Enable use of LAPACK directly within Ceres." ON)
 # Template specializations for the Schur complement based solvers. If
 # compile time, binary size or compiler performance is an issue, you
@@ -260,6 +261,31 @@
   endif (EIGENSPARSE)
 endif (Eigen3_FOUND)
 
+if (CUDA)
+  find_package(CUDA QUIET)
+  if (CUDA_FOUND)
+    message("-- Found CUDA version ${CUDA_VERSION}: "
+        "${CUDA_LIBRARIES};"
+        "${CUDA_cusolver_LIBRARY};"
+        "${CUDA_cusparse_LIBRARY}")
+    if (CUDA_VERSION VERSION_LESS 11.1)
+      message("-- CUDA version ${CUDA_VERSION} is older than 11.1, "
+          "using 32-bit API.")
+      add_definitions(-DCERES_CUDA_NO_64BIT_SOLVER_API)
+    else()
+      message("-- CUDA version ${CUDA_VERSION} is at least 11.1, "
+          "using 64-bit API.")
+    endif()
+  else (CUDA_FOUND)
+    message("-- Did not find CUDA library, disabling CUDA support.")
+    update_cache_variable(CUDA OFF)
+    list(APPEND CERES_COMPILE_OPTIONS CERES_NO_CUDA)
+  endif (CUDA_FOUND)
+else (CUDA)
+  message("-- Building without CUDA.")
+  list(APPEND CERES_COMPILE_OPTIONS CERES_NO_CUDA)
+endif (CUDA)
+
 if (LAPACK)
   find_package(LAPACK QUIET)
   if (LAPACK_FOUND)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 4a516f6..7022f67 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -53,6 +53,9 @@
 // If defined, Ceres was compiled without CXSparse.
 @CERES_NO_CXSPARSE@
 
+// If defined, Ceres was compiled without CUDA.
+@CERES_NO_CUDA@
+
 // If defined, Ceres was compiled without Apple's Accelerate framework solvers.
 @CERES_NO_ACCELERATE_SPARSE@
 
diff --git a/examples/bundle_adjuster.cc b/examples/bundle_adjuster.cc
index ef2dbc4..1940cf2 100644
--- a/examples/bundle_adjuster.cc
+++ b/examples/bundle_adjuster.cc
@@ -93,7 +93,7 @@
 DEFINE_string(sparse_linear_algebra_library, "suite_sparse",
               "Options are: suite_sparse and cx_sparse.");
 DEFINE_string(dense_linear_algebra_library, "eigen",
-              "Options are: eigen and lapack.");
+              "Options are: eigen, lapack, and cuda");
 DEFINE_string(ordering, "automatic", "Options are: automatic, user.");
 
 DEFINE_bool(use_quaternions, false, "If true, uses quaternions to represent "
diff --git a/include/ceres/types.h b/include/ceres/types.h
index 5ee6fdc..3a38805 100644
--- a/include/ceres/types.h
+++ b/include/ceres/types.h
@@ -186,6 +186,7 @@
 enum DenseLinearAlgebraLibraryType {
   EIGEN,
   LAPACK,
+  CUDA,
 };
 
 // Logging options
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 80703a1..7a8bd41 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -218,6 +218,14 @@
   list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${AccelerateSparse_LIBRARIES})
 endif()
 
+if (CUDA_FOUND)
+  list(APPEND
+       CERES_LIBRARY_PRIVATE_DEPENDENCIES
+       ${CUDA_LIBRARIES}
+       ${CUDA_cusolver_LIBRARY}
+       ${CUDA_cusparse_LIBRARY})
+endif (CUDA_FOUND)
+
 if (LAPACK_FOUND)
   list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${LAPACK_LIBRARIES})
 endif ()
@@ -362,6 +370,10 @@
   list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS
     ${AccelerateSparse_INCLUDE_DIRS})
 endif()
+if (CUDA)
+  list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS
+    ${CUDA_INCLUDE_DIRS})
+endif()
 # Add include locations for optional dependencies to the Ceres target without
 # duplication.
 list(REMOVE_DUPLICATES CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS)
@@ -453,6 +465,7 @@
   ceres_test(cost_function_to_functor)
   ceres_test(covariance)
   ceres_test(cubic_interpolation)
+  ceres_test(cuda_dense_cholesky)
   ceres_test(dense_linear_solver)
   ceres_test(dense_cholesky)
   ceres_test(dense_qr)
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
new file mode 100644
index 0000000..2d983af
--- /dev/null
+++ b/internal/ceres/cuda_buffer.h
@@ -0,0 +1,98 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#ifndef CERES_INTERNAL_CUDA_BUFFER_H_
+#define CERES_INTERNAL_CUDA_BUFFER_H_
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include <vector>
+
+#include "cuda_runtime.h"
+#include "glog/logging.h"
+
+// An encapsulated buffer to maintain GPU memory, and handle transfers between
+// GPU and system memory. It is the responsibility of the user to ensure that
+// the appropriate GPU device is selected before each subroutine is called. This
+// is particularly important when using multiple GPU devices on different CPU
+// threads, since active Cuda devices are determined by the cuda runtime on a
+// per-thread basis.
+template <typename T>
+class CudaBuffer {
+ public:
+  CudaBuffer() {}
+  CudaBuffer(const CudaBuffer&) = delete;
+  CudaBuffer& operator=(const CudaBuffer&) = delete;
+
+  ~CudaBuffer() {
+    if (data_ != nullptr) {
+      CHECK_EQ(cudaFree(data_), cudaSuccess);
+    }
+  }
+
+  void Reserve(const size_t size) {
+    if (size > size_) {
+      if (data_ != nullptr) {
+        CHECK_EQ(cudaFree(data_), cudaSuccess);
+      }
+      CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess);
+      size_ = size;
+    }
+  }
+
+  void CopyToGpu(const T* data, const size_t size) {
+    Reserve(size);
+    CHECK_EQ(cudaMemcpy(data_, data, size * sizeof(T), cudaMemcpyHostToDevice),
+             cudaSuccess);
+  }
+
+  void CopyToHost(T* data, const size_t size) {
+    CHECK(data_ != nullptr);
+    CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
+             cudaSuccess);
+  }
+
+  void CopyToGpu(const std::vector<T>& data) {
+    CopyToGpu(data.data(), data.size());
+  }
+
+  T* data() { return data_; }
+  size_t size() const { return size_; }
+
+ private:
+  T* data_ = nullptr;
+  size_t size_ = 0;
+};
+
+#endif  // CERES_NO_CUDA
+
+#endif  // CERES_INTERNAL_CUDA_BUFFER_H_
\ No newline at end of file
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
new file mode 100644
index 0000000..7d39b0e
--- /dev/null
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -0,0 +1,207 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <string>
+
+#include "ceres/dense_cholesky.h"
+#include "ceres/internal/eigen.h"
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+using std::string;
+
+namespace ceres {
+namespace internal {
+
+#ifndef CERES_NO_CUDA
+
+TEST(DenseCholeskyCudaSolverOld, InvalidOptionOnCreate) {
+  LinearSolver::Options options;
+  auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+  EXPECT_EQ(dense_cuda_solver, nullptr);
+}
+
+// Tests the CUDA Cholesky solver with a simple 4x4 matrix.
+TEST(DenseCholeskyCudaSolverOld, Cholesky4x4Matrix) {
+  Eigen::Matrix4d A;
+  A <<  4,  12, -16, 0,
+       12,  37, -43, 0,
+      -16, -43,  98, 0,
+        0,   0,   0, 1;
+  const Eigen::Vector4d b = Eigen::Vector4d::Ones();
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  string error_string;
+  ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+                                        A.data(),
+                                        &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+  Eigen::Vector4d x = Eigen::Vector4d::Zero();
+  ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+  EXPECT_NEAR(x(0), 113.75 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+  EXPECT_NEAR(x(1), -31.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+  EXPECT_NEAR(x(2), 5.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+  EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10);
+}
+
+TEST(DenseCholeskyCudaSolverOld, SingularMatrix) {
+  Eigen::Matrix3d A;
+  A <<  1, 0, 0,
+        0, 1, 0,
+        0, 0, 0;
+  const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  string error_string;
+  ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+                                        A.data(),
+                                        &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
+}
+
+TEST(DenseCholeskyCudaSolverOld, NegativeMatrix) {
+  Eigen::Matrix3d A;
+  A <<  1, 0, 0,
+        0, 1, 0,
+        0, 0, -1;
+  const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  string error_string;
+  ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+                                        A.data(),
+                                        &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
+}
+
+TEST(DenseCholeskyCudaSolverOld, MustFactorizeBeforeSolve) {
+  const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  string error_string;
+  ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
+}
+
+#ifndef CERES_CUDA_NO_64BIT_SOLVER_API
+// TODO(sameeragarwal): Convert these tests into templated tests so that we use
+// the exact same code for both versions of the test.
+
+TEST(DenseCholeskyCudaSolverNew, InvalidOptionOnCreate) {
+  LinearSolver::Options options;
+  auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
+  EXPECT_EQ(dense_cuda_solver, nullptr);
+}
+
+TEST(DenseCholeskyCudaSolverNew, Cholesky4x4Matrix) {
+  Eigen::Matrix4d A;
+  A <<  4,  12, -16, 0,
+       12,  37, -43, 0,
+      -16, -43,  98, 0,
+        0,   0,   0, 1;
+  const Eigen::Vector4d b = Eigen::Vector4d::Ones();
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  string error_string;
+  ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+                                        A.data(),
+                                        &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+  Eigen::Vector4d x = Eigen::Vector4d::Zero();
+  ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+  EXPECT_NEAR(x(0), 113.75 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+  EXPECT_NEAR(x(1), -31.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+  EXPECT_NEAR(x(2), 5.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+  EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10);
+}
+
+TEST(DenseCholeskyCudaSolverNew, SingularMatrix) {
+  Eigen::Matrix3d A;
+  A <<  1, 0, 0,
+        0, 1, 0,
+        0, 0, 0;
+  const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  string error_string;
+  ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+                                        A.data(),
+                                        &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
+}
+
+TEST(DenseCholeskyCudaSolverNew, NegativeMatrix) {
+  Eigen::Matrix3d A;
+  A <<  1, 0, 0,
+        0, 1, 0,
+        0, 0, -1;
+  const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  string error_string;
+  ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+                                        A.data(),
+                                        &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
+}
+
+TEST(DenseCholeskyCudaSolverNew, MustFactorizeBeforeSolve) {
+  const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  string error_string;
+  ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
+}
+#endif  // CERES_CUDA_NO_64BIT_SOLVER_API
+
+#endif  // CERES_NO_CUDA
+
+}  // namespace internal
+}  // namespace ceres
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index f83fdf1..6070ea0 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -33,6 +33,12 @@
 #include <algorithm>
 #include <memory>
 #include <string>
+#include <vector>
+
+#ifndef CERES_NO_CUDA
+#include "cuda_runtime.h"
+#include "cusolverDn.h"
+#endif  // CERES_NO_CUDA
 
 #ifndef CERES_NO_LAPACK
 
@@ -70,6 +76,14 @@
       LOG(FATAL) << "Ceres was compiled without support for LAPACK.";
 #endif
 
+    case CUDA:
+#ifndef CERES_NO_CUDA
+      dense_cholesky = CUDADenseCholesky::Create(options);
+      break;
+#else
+      LOG(FATAL) << "Ceres was compiled without support for CUDA.";
+#endif
+
     default:
       LOG(FATAL) << "Unknown dense linear algebra library type : "
                  << DenseLinearAlgebraLibraryTypeToString(
@@ -175,5 +189,362 @@
 
 #endif  // CERES_NO_LAPACK
 
+#ifndef CERES_NO_CUDA
+
+bool CUDADenseCholesky32Bit::Init(std::string* message) {
+  if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnCreate failed.";
+    return false;
+  }
+  if (cudaStreamCreate(&stream_) != cudaSuccess) {
+    *message = "cuSolverDN::cudaStreamCreate failed.";
+    cusolverDnDestroy(cusolver_handle_);
+    return false;
+  }
+  if (cusolverDnSetStream(cusolver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnSetStream failed.";
+    cudaStreamDestroy(stream_);
+    cusolverDnDestroy(cusolver_handle_);
+    return false;
+  }
+  error_.Reserve(1);
+  *message = "CUDADenseCholesky64Bit::Init Success.";
+  return true;
+}
+
+CUDADenseCholesky32Bit::~CUDADenseCholesky32Bit() {
+  if (cusolver_handle_ != nullptr) {
+    CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS);
+    CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess);
+  }
+}
+
+LinearSolverTerminationType CUDADenseCholesky32Bit::Factorize(
+    int num_cols,
+    double* lhs,
+    std::string* message) {
+  factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  // Allocate GPU memory if necessary.
+  lhs_.Reserve(num_cols * num_cols);
+  num_cols_ = num_cols;
+  // Copy A to GPU.
+  lhs_.CopyToGpu(lhs, num_cols * num_cols);
+  // Allocate scratch space on GPU.
+  int device_workspace_size = 0;
+  if (cusolverDnDpotrf_bufferSize(cusolver_handle_,
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      num_cols,
+                                      lhs_.data(),
+                                      num_cols,
+                                      &device_workspace_size) !=
+      CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnDpotrf_bufferSize failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  // Allocate GPU scratch memory.
+  device_workspace_.Reserve(device_workspace_size);
+  // Run the actual factorization (potrf)
+  if (cusolverDnDpotrf(cusolver_handle_,
+                            CUBLAS_FILL_MODE_LOWER,
+                            num_cols,
+                            lhs_.data(),
+                            num_cols,
+                            reinterpret_cast<double*>(device_workspace_.data()),
+                            device_workspace_.size(),
+                            error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnDpotrf failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  if (cudaDeviceSynchronize() != cudaSuccess ||
+      cudaStreamSynchronize(stream_), cudaSuccess) {
+    *message = "Cuda device synchronization failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  int error = 0;
+  // Check for errors.
+  error_.CopyToHost(&error, 1);
+  if (error < 0) {
+    LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
+               << "please report it. "
+               << "cuSolverDN::cusolverDnXpotrf fatal error."
+               << "Argument: " << -error << " is invalid.";
+    // The following line is unreachable, but return failure just to be
+    // pedantic, since the compiler does not know that.
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  } else if (error > 0) {
+    *message = StringPrintf(
+        "cuSolverDN::cusolverDnDpotrf numerical failure. "
+        "The leading minor of order %d is not positive definite.",
+        error);
+    factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
+    return LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
+  }
+  *message = "Success";
+  factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+  return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+LinearSolverTerminationType CUDADenseCholesky32Bit::Solve(
+    const double* rhs,
+    double* solution,
+    std::string* message) {
+  if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
+    *message = "Factorize did not complete succesfully previously.";
+    return factorize_result_;
+  }
+  // Copy RHS to GPU.
+  rhs_.CopyToGpu(rhs, num_cols_);
+  // Solve the system.
+  if (cusolverDnDpotrs(cusolver_handle_,
+                            CUBLAS_FILL_MODE_LOWER,
+                            num_cols_,
+                            1,
+                            lhs_.data(),
+                            num_cols_,
+                            rhs_.data(),
+                            num_cols_,
+                            error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnDpotrs failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  if (cudaDeviceSynchronize() != cudaSuccess ||
+      cudaStreamSynchronize(stream_), cudaSuccess) {
+    *message = "Cuda device synchronization failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  // Check for errors.
+  int error = 0;
+  // Copy X from GPU to host.
+  rhs_.CopyToHost(solution, num_cols_);
+  if (error != 0) {
+    LOG(FATAL) << "Congratulations, you found a bug in Ceres."
+               << "Please report it."
+               << "cuSolverDN::cusolverDnDpotrs fatal error."
+               << "Argument: " << -error << " is invalid.";
+  }
+  // Copy error variable from GPU to host.
+  error_.CopyToHost(&error, 1);
+  *message = "Success";
+  return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+std::unique_ptr<CUDADenseCholesky32Bit> CUDADenseCholesky32Bit::Create(
+  const LinearSolver::Options& options) {
+  if (options.dense_linear_algebra_library_type != CUDA) {
+    // The user called the wrong factory method.
+    return nullptr;
+  }
+  CUDADenseCholesky32Bit* cuda_dense_cholesky = new CUDADenseCholesky32Bit();
+  std::string cuda_error;
+  if (cuda_dense_cholesky->Init(&cuda_error)) {
+    return std::unique_ptr<CUDADenseCholesky32Bit>(cuda_dense_cholesky);
+  }
+  // Initialization failed, destroy the object and return a nullptr.
+  delete cuda_dense_cholesky;
+  LOG(ERROR) << "CUDADenseCholesky32Bit::Init failed: " << cuda_error;
+  return std::unique_ptr<CUDADenseCholesky32Bit>(nullptr);
+}
+
+std::unique_ptr<CUDADenseCholesky64Bit> CUDADenseCholesky64Bit::Create(
+  const LinearSolver::Options& options) {
+  if (options.dense_linear_algebra_library_type != CUDA) {
+    // The user called the wrong factory method.
+    return nullptr;
+  }
+  CUDADenseCholesky64Bit* cuda_dense_cholesky = new CUDADenseCholesky64Bit();
+  std::string cuda_error;
+  if (cuda_dense_cholesky->Init(&cuda_error)) {
+    return std::unique_ptr<CUDADenseCholesky64Bit>(cuda_dense_cholesky);
+  }
+  // Initialization failed, destroy the object and return a nullptr.
+  delete cuda_dense_cholesky;
+  LOG(ERROR) << "CUDADenseCholesky64Bit::Init failed: " << cuda_error;
+  return std::unique_ptr<CUDADenseCholesky64Bit>(nullptr);
+}
+
+#ifdef CERES_CUDA_NO_64BIT_SOLVER_API
+// CUDA < 11.1 did not have the 64-bit APIs, so the implementation of the new
+// interface will just generate a fatal error.
+
+CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() {}
+
+bool CUDADenseCholesky64Bit::Init(std::string* message) {
+  *message = "Cannot use CUDADenseCholesky64Bit with CUDA < 11.1.";
+  return false;
+}
+
+LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize(
+    int,
+    double*,
+    std::string*) {
+  return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+}
+
+LinearSolverTerminationType CUDADenseCholesky64Bit::Solve(
+    const double*,
+    double*,
+    std::string*) {
+  return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+}
+
+#else  // CERES_CUDA_NO_64BIT_SOLVER_API
+
+bool CUDADenseCholesky64Bit::Init(std::string* message) {
+  if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnCreate failed.";
+    return false;
+  }
+  if (cudaStreamCreate(&stream_) != cudaSuccess) {
+    *message = "cuSolverDN::cudaStreamCreate failed.";
+    cusolverDnDestroy(cusolver_handle_);
+    return false;
+  }
+  if (cusolverDnSetStream(cusolver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnSetStream failed.";
+    cudaStreamDestroy(stream_);
+    cusolverDnDestroy(cusolver_handle_);
+    return false;
+  }
+  error_.Reserve(1);
+  *message = "CUDADenseCholesky64Bit::Init Success.";
+  return true;
+}
+
+CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() {
+  if (cusolver_handle_ != nullptr) {
+    CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS);
+    CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess);
+  }
+}
+
+LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize(
+    int num_cols,
+    double* lhs,
+    std::string* message) {
+  factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  // Allocate GPU memory if necessary.
+  lhs_.Reserve(num_cols * num_cols);
+  num_cols_ = num_cols;
+  // Copy A to GPU.
+  lhs_.CopyToGpu(lhs, num_cols * num_cols);
+  // Allocate scratch space on GPU.
+  size_t host_workspace_size = 0;
+  size_t device_workspace_size = 0;
+  if (cusolverDnXpotrf_bufferSize(cusolver_handle_,
+                                      nullptr,
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      num_cols,
+                                      CUDA_R_64F,
+                                      lhs_.data(),
+                                      num_cols,
+                                      CUDA_R_64F,
+                                      &device_workspace_size,
+                                      &host_workspace_size) !=
+      CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnXpotrf_bufferSize failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  // Allocate host scratch memory.
+  host_workspace_.resize(host_workspace_size);
+  // Allocate GPU scratch memory.
+  device_workspace_.Reserve(device_workspace_size);
+  // Run the actual factorization (potrf)
+  if (cusolverDnXpotrf(cusolver_handle_,
+                            nullptr,
+                            CUBLAS_FILL_MODE_LOWER,
+                            num_cols,
+                            CUDA_R_64F,
+                            lhs_.data(),
+                            num_cols,
+                            CUDA_R_64F,
+                            device_workspace_.data(),
+                            device_workspace_.size(),
+                            host_workspace_.data(),
+                            host_workspace_.size(),
+                            error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnXpotrf failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  if (cudaDeviceSynchronize() != cudaSuccess ||
+      cudaStreamSynchronize(stream_) != cudaSuccess) {
+    *message = "Cuda device synchronization failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  int error = 0;
+  // Check for errors.
+  error_.CopyToHost(&error, 1);
+  if (error < 0) {
+    LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
+               << "please report it. "
+               << "cuSolverDN::cusolverDnXpotrf fatal error."
+               << "Argument: " << -error << " is invalid.";
+    // The following line is unreachable, but return failure just to be
+    // pedantic, since the compiler does not know that.
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  } else if (error > 0) {
+    *message = StringPrintf(
+        "cuSolverDN::cusolverDnXpotrf numerical failure. "
+        "The leading minor of order %d is not positive definite.",
+        error);
+    factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
+    return LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
+  }
+
+  *message = "Success";
+  factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+  return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+LinearSolverTerminationType CUDADenseCholesky64Bit::Solve(
+    const double* rhs,
+    double* solution,
+    std::string* message) {
+  if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
+    *message = "Factorize did not complete succesfully previously.";
+    return factorize_result_;
+  }
+  // Copy RHS to GPU.
+  rhs_.CopyToGpu(rhs, num_cols_);
+  // Solve the system.
+  if (cusolverDnXpotrs(cusolver_handle_,
+                            nullptr,
+                            CUBLAS_FILL_MODE_LOWER,
+                            num_cols_,
+                            1,
+                            CUDA_R_64F,
+                            lhs_.data(),
+                            num_cols_,
+                            CUDA_R_64F,
+                            rhs_.data(),
+                            num_cols_,
+                            error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnXpotrs failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  if (cudaDeviceSynchronize() != cudaSuccess ||
+      cudaStreamSynchronize(stream_) != cudaSuccess) {
+    *message = "Cuda device synchronization failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  // Check for errors.
+  int error = 0;
+  // Copy error variable from GPU to host.
+  error_.CopyToHost(&error, 1);
+  if (error != 0) {
+    LOG(FATAL) << "Congratulations, you found a bug in Ceres."
+               << "Please report it."
+               << "cuSolverDN::cusolverDnXpotrs fatal error."
+               << "Argument: " << -error << " is invalid.";
+  }
+  // Copy X from GPU to host.
+  rhs_.CopyToHost(solution, num_cols_);
+  *message = "Success";
+  return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+#endif // CERES_CUDA_NO_64BIT_SOLVER_API
+
+#endif  // CERES_NO_CUDA
+
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index 268862d..470998a 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -37,10 +37,16 @@
 // clang-format on
 
 #include <memory>
+#include <vector>
 
 #include "Eigen/Dense"
+#include "ceres/cuda_buffer.h"
 #include "ceres/linear_solver.h"
 #include "glog/logging.h"
+#ifndef CERES_NO_CUDA
+#include "cuda_runtime.h"
+#include "cusolverDn.h"
+#endif  // CERES_NO_CUDA
 
 namespace ceres {
 namespace internal {
@@ -129,6 +135,107 @@
 };
 #endif  // CERES_NO_LAPACK
 
+#ifndef CERES_NO_CUDA
+// Implementation of DenseCholesky using the cuSolver library v.11.0 or older,
+// using the legacy cuSolverDn interface.
+class CERES_EXPORT_INTERNAL CUDADenseCholesky32Bit : public DenseCholesky {
+ public:
+  static std::unique_ptr<CUDADenseCholesky32Bit> Create(
+      const LinearSolver::Options& options);
+  ~CUDADenseCholesky32Bit() override;
+  CUDADenseCholesky32Bit(const CUDADenseCholesky32Bit&) = delete;
+  CUDADenseCholesky32Bit& operator=(const CUDADenseCholesky32Bit&) = delete;
+  LinearSolverTerminationType Factorize(int num_cols,
+                                        double* lhs,
+                                        std::string* message) override;
+  LinearSolverTerminationType Solve(const double* rhs,
+                                    double* solution,
+                                    std::string* message) override;
+
+ private:
+  CUDADenseCholesky32Bit() {}
+  // Initializes the cuSolverDN context, creates an asynchronous stream, and
+  // associates the stream with cuSolverDN. Returns true iff initialization was
+  // successful, else it returns false and a human-readable error message is
+  // returned.
+  bool Init(std::string* message);
+
+  // Handle to the cuSOLVER context.
+  cusolverDnHandle_t cusolver_handle_ = nullptr;
+  // CUDA device stream.
+  cudaStream_t stream_ = nullptr;
+  // Number of columns in the A matrix, to be cached between calls to *Factorize
+  // and *Solve.
+  size_t num_cols_ = 0;
+  // GPU memory allocated for the A matrix (lhs matrix).
+  CudaBuffer<double> lhs_;
+  // GPU memory allocated for the B matrix (rhs vector).
+  CudaBuffer<double> rhs_;
+  // Scratch space for cuSOLVER on the GPU.
+  CudaBuffer<uint8_t> device_workspace_;
+  // Required for error handling with cuSOLVER.
+  CudaBuffer<int> error_;
+  // Cache the result of Factorize to ensure that when Solve is called, the
+  // factiorization of lhs is valid.
+  LinearSolverTerminationType factorize_result_ =
+      LINEAR_SOLVER_FATAL_ERROR;
+};
+
+// Implementation of DenseCholesky using the cuSolver library v.11.1 or newer,
+// using the 64-bit cuSolverDn interface.
+class CERES_EXPORT_INTERNAL CUDADenseCholesky64Bit : public DenseCholesky {
+ public:
+  static std::unique_ptr<CUDADenseCholesky64Bit> Create(
+      const LinearSolver::Options& options);
+  ~CUDADenseCholesky64Bit() override;
+  CUDADenseCholesky64Bit(const CUDADenseCholesky64Bit&) = delete;
+  CUDADenseCholesky64Bit& operator=(const CUDADenseCholesky64Bit&) = delete;
+  LinearSolverTerminationType Factorize(int num_cols,
+                                        double* lhs,
+                                        std::string* message) override;
+  LinearSolverTerminationType Solve(const double* rhs,
+                                    double* solution,
+                                    std::string* message) override;
+
+ private:
+  CUDADenseCholesky64Bit() {}
+  // Initializes the cuSolverDN context, creates an asynchronous stream, and
+  // associates the stream with cuSolverDN. Returns true iff initialization was
+  // successful, else it returns false and a human-readable error message is
+  // returned.
+  bool Init(std::string* message);
+
+  // Handle to the cuSOLVER context.
+  cusolverDnHandle_t cusolver_handle_ = nullptr;
+  // CUDA device stream.
+  cudaStream_t stream_ = nullptr;
+  // Number of columns in the A matrix, to be cached between calls to Factorize
+  // and Solve.
+  size_t num_cols_ = 0;
+  // GPU memory allocated for the A matrix (lhs matrix).
+  CudaBuffer<double> lhs_;
+  // GPU memory allocated for the B matrix (rhs vector).
+  CudaBuffer<double> rhs_;
+  // Workspace for cuSOLVER on the GPU.
+  CudaBuffer<uint8_t> device_workspace_;
+  // Workspace for cuSOLVER on the host.
+  std::vector<double> host_workspace_;
+  // Required for error handling with cuSOLVER.
+  CudaBuffer<int> error_;
+  // Cache the result of Factorize to ensure that when Solve is called, the
+  // factiorization of lhs is valid.
+  LinearSolverTerminationType factorize_result_ =
+      LINEAR_SOLVER_FATAL_ERROR;
+};
+
+#ifdef CERES_CUDA_NO_64BIT_SOLVER_API
+using CUDADenseCholesky = CUDADenseCholesky32Bit;
+#else
+using CUDADenseCholesky = CUDADenseCholesky64Bit;
+#endif
+
+#endif  // CERES_NO_CUDA
+
 }  // namespace internal
 }  // namespace ceres
 
diff --git a/internal/ceres/dense_cholesky_test.cc b/internal/ceres/dense_cholesky_test.cc
index eda4f10..4141e79 100644
--- a/internal/ceres/dense_cholesky_test.cc
+++ b/internal/ceres/dense_cholesky_test.cc
@@ -92,17 +92,20 @@
   }
 }
 
+INSTANTIATE_TEST_SUITE_P(
+    _,
+    DenseCholeskyTest,
+    ::testing::Values(
+        EIGEN,
 #ifndef CERES_NO_LAPACK
-INSTANTIATE_TEST_SUITE_P(_,
-                         DenseCholeskyTest,
-                         ::testing::Values(EIGEN, LAPACK),
-                         ParamInfoToString);
-#else
-INSTANTIATE_TEST_SUITE_P(_,
-                         DenseCholeskyTest,
-                         ::testing::Values(EIGEN),
-                         ParamInfoToString);
+        LAPACK,
 #endif
+#ifndef CERES_NO_CUDA
+        CUDA
+#endif
+    ),
+    ParamInfoToString);
+
 
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/dense_linear_solver_benchmark.cc b/internal/ceres/dense_linear_solver_benchmark.cc
index b8f1e00..2af9fa8 100644
--- a/internal/ceres/dense_linear_solver_benchmark.cc
+++ b/internal/ceres/dense_linear_solver_benchmark.cc
@@ -95,7 +95,12 @@
     ->Apply(MatrixSizes);
 BENCHMARK_TEMPLATE2(BM_DenseSolver, ceres::LAPACK, ceres::DENSE_NORMAL_CHOLESKY)
     ->Apply(MatrixSizes);
-#endif
+#endif  // CERES_NO_LAPACK
+
+#ifndef CERES_NO_CUDA
+BENCHMARK(BM_DenseSolver<ceres::CUDA, ceres::DENSE_NORMAL_CHOLESKY>)
+    ->Apply(MatrixSizes);
+#endif  // CERES_NO_CUDA
 
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/solver_utils.cc b/internal/ceres/solver_utils.cc
index eb5aafa..1622e0f 100644
--- a/internal/ceres/solver_utils.cc
+++ b/internal/ceres/solver_utils.cc
@@ -36,6 +36,9 @@
 #include "ceres/internal/config.h"
 #include "ceres/internal/port.h"
 #include "ceres/version.h"
+#ifndef CERES_NO_CUDA
+#include "cuda_runtime.h"
+#endif  // CERES_NO_CUDA
 
 namespace ceres {
 namespace internal {
@@ -87,6 +90,10 @@
   value += "-no_custom_blas";
 #endif
 
+#ifndef CERES_NO_CUDA
+  value += "-cuda-(" + std::to_string(CUDART_VERSION) + ")";
+#endif
+
   return value;
 }
 
diff --git a/internal/ceres/types.cc b/internal/ceres/types.cc
index 39bb2d8..bf99568 100644
--- a/internal/ceres/types.cc
+++ b/internal/ceres/types.cc
@@ -128,6 +128,7 @@
   switch (type) {
     CASESTR(EIGEN);
     CASESTR(LAPACK);
+    CASESTR(CUDA);
     default:
       return "UNKNOWN";
   }
@@ -138,6 +139,7 @@
   UpperCase(&value);
   STRENUM(EIGEN);
   STRENUM(LAPACK);
+  STRENUM(CUDA);
   return false;
 }