Add support for dense CUDA solvers #2

1. Add CUDADenseQR & tests.
   CUDADenseQR uses the cuSolverDN LAPACK implementation
   of QR factorization. A key limitation, however, is that
   this solver does not perform singularity checking --
   this is because cuSolverDN does not have a trtrs
   implementation; we instead use cuBLAS' trsv for
   backsubstitution.
2. All CPU -> GPU memory transfers are now async, and both
   CUDADenseQR and CUDADenseCholesky explicitly manage their
   own streams for async operations.
3. Simplified CUDADenseCholesky to only use the legacy 32-bit
   cuSolverDN API.

Change-Id: I2a9b7b65469658ddfe33b5b2a3892c8744d6e437
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 48ba201..69432f7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -270,14 +270,6 @@
         "${CUDA_LIBRARIES};"
         "${CUDA_cusolver_LIBRARY};"
         "${CUDA_cusparse_LIBRARY}")
-    if (CUDA_VERSION VERSION_LESS 11.1)
-      message("-- CUDA version ${CUDA_VERSION} is older than 11.1, "
-          "using 32-bit API.")
-      add_definitions(-DCERES_CUDA_NO_64BIT_SOLVER_API)
-    else()
-      message("-- CUDA version ${CUDA_VERSION} is at least 11.1, "
-          "using 64-bit API.")
-    endif()
   else (CUDA_FOUND)
     message("-- Did not find CUDA library, disabling CUDA support.")
     update_cache_variable(CUDA OFF)
diff --git a/examples/nist.cc b/examples/nist.cc
index a3430ca..cb0d1aa0 100644
--- a/examples/nist.cc
+++ b/examples/nist.cc
@@ -101,7 +101,7 @@
               "and cgnr");
 DEFINE_string(dense_linear_algebra_library,
               "eigen",
-              "Options are: eigen and lapack.");
+              "Options are: eigen, lapack, and cuda.");
 DEFINE_string(preconditioner, "jacobi", "Options are: identity, jacobi");
 DEFINE_string(line_search,
               "wolfe",
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 166eb19..6c6c3c1 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -143,6 +143,7 @@
   list(APPEND
        CERES_LIBRARY_PRIVATE_DEPENDENCIES
        ${CUDA_LIBRARIES}
+       ${CUDA_cublas_LIBRARY}
        ${CUDA_cusolver_LIBRARY}
        ${CUDA_cusparse_LIBRARY})
 endif (CUDA_FOUND)
@@ -479,6 +480,7 @@
   ceres_test(covariance)
   ceres_test(cubic_interpolation)
   ceres_test(cuda_dense_cholesky)
+  ceres_test(cuda_dense_qr)
   ceres_test(dense_linear_solver)
   ceres_test(dense_cholesky)
   ceres_test(dense_qr)
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index 44e7de3..61f9752 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -45,7 +45,8 @@
 // the appropriate GPU device is selected before each subroutine is called. This
 // is particularly important when using multiple GPU devices on different CPU
 // threads, since active Cuda devices are determined by the cuda runtime on a
-// per-thread basis.
+// per-thread basis. Note that unless otherwise specified, all methods use the
+// default stream, and are synchronous.
 template <typename T>
 class CudaBuffer {
  public:
@@ -59,6 +60,8 @@
     }
   }
 
+  // Grow the GPU memory buffer if needed to accommodate data of the specified
+  // size
   void Reserve(const size_t size) {
     if (size > size_) {
       if (data_ != nullptr) {
@@ -69,12 +72,21 @@
     }
   }
 
-  void CopyToGpu(const T* data, const size_t size) {
+  // Perform an asynchronous copy from CPU memory to GPU memory using the stream
+  // provided.
+  void CopyToGpuAsync(const T* data, const size_t size, cudaStream_t stream) {
     Reserve(size);
-    CHECK_EQ(cudaMemcpy(data_, data, size * sizeof(T), cudaMemcpyHostToDevice),
+    CHECK_EQ(cudaMemcpyAsync(data_,
+                             data,
+                             size * sizeof(T),
+                             cudaMemcpyHostToDevice,
+                             stream),
              cudaSuccess);
   }
 
+  // Copy data from the GPU to CPU memory. This is necessarily synchronous since
+  // any potential GPU kernels that may be writing to the buffer must finish
+  // before the transfer happens.
   void CopyToHost(T* data, const size_t size) {
     CHECK(data_ != nullptr);
     CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
index 7d39b0e..6c4dcc3 100644
--- a/internal/ceres/cuda_dense_cholesky_test.cc
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -36,21 +36,19 @@
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 
-using std::string;
-
 namespace ceres {
 namespace internal {
 
 #ifndef CERES_NO_CUDA
 
-TEST(DenseCholeskyCudaSolverOld, InvalidOptionOnCreate) {
+TEST(CUDADenseCholesky, InvalidOptionOnCreate) {
   LinearSolver::Options options;
-  auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+  auto dense_cuda_solver = CUDADenseCholesky::Create(options);
   EXPECT_EQ(dense_cuda_solver, nullptr);
 }
 
 // Tests the CUDA Cholesky solver with a simple 4x4 matrix.
-TEST(DenseCholeskyCudaSolverOld, Cholesky4x4Matrix) {
+TEST(CUDADenseCholesky, Cholesky4x4Matrix) {
   Eigen::Matrix4d A;
   A <<  4,  12, -16, 0,
        12,  37, -43, 0,
@@ -59,9 +57,9 @@
   const Eigen::Vector4d b = Eigen::Vector4d::Ones();
   LinearSolver::Options options;
   options.dense_linear_algebra_library_type = CUDA;
-  auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+  auto dense_cuda_solver = CUDADenseCholesky::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
-  string error_string;
+  std::string error_string;
   ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
                                         A.data(),
                                         &error_string),
@@ -75,7 +73,7 @@
   EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10);
 }
 
-TEST(DenseCholeskyCudaSolverOld, SingularMatrix) {
+TEST(CUDADenseCholesky, SingularMatrix) {
   Eigen::Matrix3d A;
   A <<  1, 0, 0,
         0, 1, 0,
@@ -83,16 +81,16 @@
   const Eigen::Vector3d b = Eigen::Vector3d::Ones();
   LinearSolver::Options options;
   options.dense_linear_algebra_library_type = CUDA;
-  auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+  auto dense_cuda_solver = CUDADenseCholesky::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
-  string error_string;
+  std::string error_string;
   ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
                                         A.data(),
                                         &error_string),
             LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
 }
 
-TEST(DenseCholeskyCudaSolverOld, NegativeMatrix) {
+TEST(CUDADenseCholesky, NegativeMatrix) {
   Eigen::Matrix3d A;
   A <<  1, 0, 0,
         0, 1, 0,
@@ -100,107 +98,26 @@
   const Eigen::Vector3d b = Eigen::Vector3d::Ones();
   LinearSolver::Options options;
   options.dense_linear_algebra_library_type = CUDA;
-  auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+  auto dense_cuda_solver = CUDADenseCholesky::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
-  string error_string;
+  std::string error_string;
   ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
                                         A.data(),
                                         &error_string),
             LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
 }
 
-TEST(DenseCholeskyCudaSolverOld, MustFactorizeBeforeSolve) {
+TEST(CUDADenseCholesky, MustFactorizeBeforeSolve) {
   const Eigen::Vector3d b = Eigen::Vector3d::Ones();
   LinearSolver::Options options;
   options.dense_linear_algebra_library_type = CUDA;
-  auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+  auto dense_cuda_solver = CUDADenseCholesky::Create(options);
   ASSERT_NE(dense_cuda_solver, nullptr);
-  string error_string;
+  std::string error_string;
   ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
             LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
 }
 
-#ifndef CERES_CUDA_NO_64BIT_SOLVER_API
-// TODO(sameeragarwal): Convert these tests into templated tests so that we use
-// the exact same code for both versions of the test.
-
-TEST(DenseCholeskyCudaSolverNew, InvalidOptionOnCreate) {
-  LinearSolver::Options options;
-  auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
-  EXPECT_EQ(dense_cuda_solver, nullptr);
-}
-
-TEST(DenseCholeskyCudaSolverNew, Cholesky4x4Matrix) {
-  Eigen::Matrix4d A;
-  A <<  4,  12, -16, 0,
-       12,  37, -43, 0,
-      -16, -43,  98, 0,
-        0,   0,   0, 1;
-  const Eigen::Vector4d b = Eigen::Vector4d::Ones();
-  LinearSolver::Options options;
-  options.dense_linear_algebra_library_type = CUDA;
-  auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
-  ASSERT_NE(dense_cuda_solver, nullptr);
-  string error_string;
-  ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
-                                        A.data(),
-                                        &error_string),
-            LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
-  Eigen::Vector4d x = Eigen::Vector4d::Zero();
-  ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
-            LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
-  EXPECT_NEAR(x(0), 113.75 / 3.0, std::numeric_limits<double>::epsilon() * 10);
-  EXPECT_NEAR(x(1), -31.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
-  EXPECT_NEAR(x(2), 5.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
-  EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10);
-}
-
-TEST(DenseCholeskyCudaSolverNew, SingularMatrix) {
-  Eigen::Matrix3d A;
-  A <<  1, 0, 0,
-        0, 1, 0,
-        0, 0, 0;
-  const Eigen::Vector3d b = Eigen::Vector3d::Ones();
-  LinearSolver::Options options;
-  options.dense_linear_algebra_library_type = CUDA;
-  auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
-  ASSERT_NE(dense_cuda_solver, nullptr);
-  string error_string;
-  ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
-                                        A.data(),
-                                        &error_string),
-            LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
-}
-
-TEST(DenseCholeskyCudaSolverNew, NegativeMatrix) {
-  Eigen::Matrix3d A;
-  A <<  1, 0, 0,
-        0, 1, 0,
-        0, 0, -1;
-  const Eigen::Vector3d b = Eigen::Vector3d::Ones();
-  LinearSolver::Options options;
-  options.dense_linear_algebra_library_type = CUDA;
-  auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
-  ASSERT_NE(dense_cuda_solver, nullptr);
-  string error_string;
-  ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
-                                        A.data(),
-                                        &error_string),
-            LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
-}
-
-TEST(DenseCholeskyCudaSolverNew, MustFactorizeBeforeSolve) {
-  const Eigen::Vector3d b = Eigen::Vector3d::Ones();
-  LinearSolver::Options options;
-  options.dense_linear_algebra_library_type = CUDA;
-  auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
-  ASSERT_NE(dense_cuda_solver, nullptr);
-  string error_string;
-  ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
-            LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
-}
-#endif  // CERES_CUDA_NO_64BIT_SOLVER_API
-
 #endif  // CERES_NO_CUDA
 
 }  // namespace internal
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc
new file mode 100644
index 0000000..15ba00a
--- /dev/null
+++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -0,0 +1,164 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <string>
+
+#include "ceres/dense_qr.h"
+#include "ceres/internal/eigen.h"
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace ceres {
+namespace internal {
+
+#ifndef CERES_NO_CUDA
+
+TEST(CUDADenseQR, InvalidOptionOnCreate) {
+  LinearSolver::Options options;
+  auto dense_cuda_solver = CUDADenseQR::Create(options);
+  EXPECT_EQ(dense_cuda_solver, nullptr);
+}
+
+// Tests the CUDA QR solver with a simple 4x4 matrix.
+TEST(CUDADenseQR, QR4x4Matrix) {
+  Eigen::Matrix4d A;
+  A <<  4,  12, -16, 0,
+       12,  37, -43, 0,
+      -16, -43,  98, 0,
+        0,   0,   0, 1;
+  const Eigen::Vector4d b = Eigen::Vector4d::Ones();
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseQR::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  std::string error_string;
+  ASSERT_EQ(dense_cuda_solver->Factorize(A.rows(),
+                                         A.cols(),
+                                         A.data(),
+                                         &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+  Eigen::Vector4d x = Eigen::Vector4d::Zero();
+  ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+  // Empirically observed accuracy of cuSolverDN's QR solver.
+  const double kEpsilon = 1e-11;
+  EXPECT_NEAR(x(0), 113.75 / 3.0, kEpsilon);
+  EXPECT_NEAR(x(1), -31.0 / 3.0, kEpsilon);
+  EXPECT_NEAR(x(2), 5.0 / 3.0, kEpsilon);
+  EXPECT_NEAR(x(3), 1.0000, kEpsilon);
+}
+
+// Tests the CUDA QR solver with a simple 4x4 matrix.
+TEST(CUDADenseQR, QR4x2Matrix) {
+  Eigen::Matrix<double, 4, 2> A;
+  A <<  4,  12,
+       12,  37,
+      -16, -43,
+        0,   0;
+  const std::vector<double> b(4, 1.0);
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseQR::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  std::string error_string;
+  ASSERT_EQ(dense_cuda_solver->Factorize(A.rows(),
+                                         A.cols(),
+                                         A.data(),
+                                         &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+  std::vector<double> x(2, 0);
+  ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+  // Empirically observed accuracy of cuSolverDN's QR solver.
+  const double kEpsilon = 1e-11;
+  // Solution values computed with Octave.
+  EXPECT_NEAR(x[0], -1.143410852713177, kEpsilon);
+  EXPECT_NEAR(x[1], 0.4031007751937981, kEpsilon);
+}
+
+TEST(CUDADenseQR, MustFactorizeBeforeSolve) {
+  const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = CUDA;
+  auto dense_cuda_solver = CUDADenseQR::Create(options);
+  ASSERT_NE(dense_cuda_solver, nullptr);
+  std::string error_string;
+  ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
+            LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
+}
+
+TEST(CUDADenseQR, Randomized1600x100Tests) {
+  const int kNumRows = 1600;
+  const int kNumCols = 100;
+  using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
+  using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+  using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+
+  LinearSolver::Options options;
+  options.dense_linear_algebra_library_type = ceres::CUDA;
+  std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options);
+
+  const int kNumTrials = 100;
+  const int kMinNumCols = 1;
+  const int kMaxNumCols = 10;
+  const int kMinRowsFactor = 1;
+  const int kMaxRowsFactor = 3;
+  for (int i = 0; i < kNumTrials; ++i) {
+    LhsType lhs = LhsType::Random(kNumRows, kNumCols);
+    SolutionType x_expected = SolutionType::Random(kNumCols);
+    RhsType rhs = lhs * x_expected;
+    SolutionType x_computed = SolutionType::Zero(kNumCols);
+    // Sanity check the random matrix sizes.
+    EXPECT_EQ(lhs.rows(), kNumRows);
+    EXPECT_EQ(lhs.cols(), kNumCols);
+    EXPECT_EQ(rhs.rows(), kNumRows);
+    EXPECT_EQ(rhs.cols(), 1);
+    EXPECT_EQ(x_expected.rows(), kNumCols);
+    EXPECT_EQ(x_expected.cols(), 1);
+    EXPECT_EQ(x_computed.rows(), kNumCols);
+    EXPECT_EQ(x_computed.cols(), 1);
+    LinearSolver::Summary summary;
+    summary.termination_type = dense_qr->FactorAndSolve(kNumRows,
+                                                        kNumCols,
+                                                        lhs.data(),
+                                                        rhs.data(),
+                                                        x_computed.data(),
+                                                        &summary.message);
+    ASSERT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS);
+    ASSERT_NEAR((x_computed - x_expected).norm() / x_expected.norm(),
+                0.0,
+                std::numeric_limits<double>::epsilon() * 400);
+  }
+}
+#endif  // CERES_NO_CUDA
+
+}  // namespace internal
+}  // namespace ceres
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index af555e7..2d8c2da 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -193,13 +193,14 @@
 
 #ifndef CERES_NO_CUDA
 
-bool CUDADenseCholesky32Bit::Init(std::string* message) {
+bool CUDADenseCholesky::Init(std::string* message) {
   if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
     *message = "cuSolverDN::cusolverDnCreate failed.";
     return false;
   }
-  if (cudaStreamCreate(&stream_) != cudaSuccess) {
-    *message = "cuSolverDN::cudaStreamCreate failed.";
+  if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) !=
+      cudaSuccess) {
+    *message = "CUDA::cudaStreamCreateWithFlags failed.";
     cusolverDnDestroy(cusolver_handle_);
     return false;
   }
@@ -211,26 +212,23 @@
     return false;
   }
   error_.Reserve(1);
-  *message = "CUDADenseCholesky64Bit::Init Success.";
+  *message = "CUDADenseCholesky::Init Success.";
   return true;
 }
 
-CUDADenseCholesky32Bit::~CUDADenseCholesky32Bit() {
+CUDADenseCholesky::~CUDADenseCholesky() {
   if (cusolver_handle_ != nullptr) {
     CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS);
     CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess);
   }
 }
 
-LinearSolverTerminationType CUDADenseCholesky32Bit::Factorize(
+LinearSolverTerminationType CUDADenseCholesky::Factorize(
     int num_cols, double* lhs, std::string* message) {
   factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-  // Allocate GPU memory if necessary.
   lhs_.Reserve(num_cols * num_cols);
   num_cols_ = num_cols;
-  // Copy A to GPU.
-  lhs_.CopyToGpu(lhs, num_cols * num_cols);
-  // Allocate scratch space on GPU.
+  lhs_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_);
   int device_workspace_size = 0;
   if (cusolverDnDpotrf_bufferSize(cusolver_handle_,
                                   CUBLAS_FILL_MODE_LOWER,
@@ -242,9 +240,7 @@
     *message = "cuSolverDN::cusolverDnDpotrf_bufferSize failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
-  // Allocate GPU scratch memory.
   device_workspace_.Reserve(device_workspace_size);
-  // Run the actual factorization (potrf)
   if (cusolverDnDpotrf(cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols,
@@ -256,13 +252,12 @@
     *message = "cuSolverDN::cusolverDnDpotrf failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess || cudaStreamSynchronize(stream_),
-      cudaSuccess) {
+  if (cudaDeviceSynchronize() != cudaSuccess ||
+      cudaStreamSynchronize(stream_) != cudaSuccess) {
     *message = "Cuda device synchronization failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
   int error = 0;
-  // Check for errors.
   error_.CopyToHost(&error, 1);
   if (error < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
@@ -285,15 +280,13 @@
   return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
 }
 
-LinearSolverTerminationType CUDADenseCholesky32Bit::Solve(
+LinearSolverTerminationType CUDADenseCholesky::Solve(
     const double* rhs, double* solution, std::string* message) {
   if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
     *message = "Factorize did not complete succesfully previously.";
     return factorize_result_;
   }
-  // Copy RHS to GPU.
-  rhs_.CopyToGpu(rhs, num_cols_);
-  // Solve the system.
+  rhs_.CopyToGpuAsync(rhs, num_cols_, stream_);
   if (cusolverDnDpotrs(cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols_,
@@ -306,237 +299,41 @@
     *message = "cuSolverDN::cusolverDnDpotrs failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess || cudaStreamSynchronize(stream_),
-      cudaSuccess) {
+  if (cudaDeviceSynchronize() != cudaSuccess ||
+      cudaStreamSynchronize(stream_) != cudaSuccess) {
     *message = "Cuda device synchronization failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
-  // Check for errors.
   int error = 0;
-  // Copy X from GPU to host.
-  rhs_.CopyToHost(solution, num_cols_);
+  error_.CopyToHost(&error, 1);
   if (error != 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
                << "Please report it."
                << "cuSolverDN::cusolverDnDpotrs fatal error. "
                << "Argument: " << -error << " is invalid.";
   }
-  // Copy error variable from GPU to host.
-  error_.CopyToHost(&error, 1);
-  *message = "Success";
-  return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
-}
-
-std::unique_ptr<CUDADenseCholesky32Bit> CUDADenseCholesky32Bit::Create(
-    const LinearSolver::Options& options) {
-  if (options.dense_linear_algebra_library_type != CUDA) {
-    // The user called the wrong factory method.
-    return nullptr;
-  }
-  CUDADenseCholesky32Bit* cuda_dense_cholesky = new CUDADenseCholesky32Bit();
-  std::string cuda_error;
-  if (cuda_dense_cholesky->Init(&cuda_error)) {
-    return std::unique_ptr<CUDADenseCholesky32Bit>(cuda_dense_cholesky);
-  }
-  // Initialization failed, destroy the object and return a nullptr.
-  delete cuda_dense_cholesky;
-  LOG(ERROR) << "CUDADenseCholesky32Bit::Init failed: " << cuda_error;
-  return std::unique_ptr<CUDADenseCholesky32Bit>(nullptr);
-}
-
-std::unique_ptr<CUDADenseCholesky64Bit> CUDADenseCholesky64Bit::Create(
-    const LinearSolver::Options& options) {
-  if (options.dense_linear_algebra_library_type != CUDA) {
-    // The user called the wrong factory method.
-    return nullptr;
-  }
-  CUDADenseCholesky64Bit* cuda_dense_cholesky = new CUDADenseCholesky64Bit();
-  std::string cuda_error;
-  if (cuda_dense_cholesky->Init(&cuda_error)) {
-    return std::unique_ptr<CUDADenseCholesky64Bit>(cuda_dense_cholesky);
-  }
-  // Initialization failed, destroy the object and return a nullptr.
-  delete cuda_dense_cholesky;
-  LOG(ERROR) << "CUDADenseCholesky64Bit::Init failed: " << cuda_error;
-  return std::unique_ptr<CUDADenseCholesky64Bit>(nullptr);
-}
-
-#ifdef CERES_CUDA_NO_64BIT_SOLVER_API
-// CUDA < 11.1 did not have the 64-bit APIs, so the implementation of the new
-// interface will just generate a fatal error.
-
-CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() {}
-
-bool CUDADenseCholesky64Bit::Init(std::string* message) {
-  *message = "Cannot use CUDADenseCholesky64Bit with CUDA < 11.1.";
-  return false;
-}
-
-LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize(int,
-                                                              double*,
-                                                              std::string*) {
-  return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-}
-
-LinearSolverTerminationType CUDADenseCholesky64Bit::Solve(const double*,
-                                                          double*,
-                                                          std::string*) {
-  return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-}
-
-#else  // CERES_CUDA_NO_64BIT_SOLVER_API
-
-bool CUDADenseCholesky64Bit::Init(std::string* message) {
-  if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
-    *message = "cuSolverDN::cusolverDnCreate failed.";
-    return false;
-  }
-  if (cudaStreamCreate(&stream_) != cudaSuccess) {
-    *message = "cuSolverDN::cudaStreamCreate failed.";
-    cusolverDnDestroy(cusolver_handle_);
-    return false;
-  }
-  if (cusolverDnSetStream(cusolver_handle_, stream_) !=
-      CUSOLVER_STATUS_SUCCESS) {
-    *message = "cuSolverDN::cusolverDnSetStream failed.";
-    cudaStreamDestroy(stream_);
-    cusolverDnDestroy(cusolver_handle_);
-    return false;
-  }
-  error_.Reserve(1);
-  *message = "CUDADenseCholesky64Bit::Init Success.";
-  return true;
-}
-
-CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() {
-  if (cusolver_handle_ != nullptr) {
-    CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS);
-    CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess);
-  }
-}
-
-LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize(
-    int num_cols, double* lhs, std::string* message) {
-  factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-  // Allocate GPU memory if necessary.
-  lhs_.Reserve(num_cols * num_cols);
-  num_cols_ = num_cols;
-  // Copy A to GPU.
-  lhs_.CopyToGpu(lhs, num_cols * num_cols);
-  // Allocate scratch space on GPU.
-  size_t host_workspace_size = 0;
-  size_t device_workspace_size = 0;
-  if (cusolverDnXpotrf_bufferSize(cusolver_handle_,
-                                  nullptr,
-                                  CUBLAS_FILL_MODE_LOWER,
-                                  num_cols,
-                                  CUDA_R_64F,
-                                  lhs_.data(),
-                                  num_cols,
-                                  CUDA_R_64F,
-                                  &device_workspace_size,
-                                  &host_workspace_size) !=
-      CUSOLVER_STATUS_SUCCESS) {
-    *message = "cuSolverDN::cusolverDnXpotrf_bufferSize failed.";
-    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-  }
-  // Allocate host scratch memory.
-  host_workspace_.resize(host_workspace_size);
-  // Allocate GPU scratch memory.
-  device_workspace_.Reserve(device_workspace_size);
-  // Run the actual factorization (potrf)
-  if (cusolverDnXpotrf(cusolver_handle_,
-                       nullptr,
-                       CUBLAS_FILL_MODE_LOWER,
-                       num_cols,
-                       CUDA_R_64F,
-                       lhs_.data(),
-                       num_cols,
-                       CUDA_R_64F,
-                       device_workspace_.data(),
-                       device_workspace_.size(),
-                       host_workspace_.data(),
-                       host_workspace_.size(),
-                       error_.data()) != CUSOLVER_STATUS_SUCCESS) {
-    *message = "cuSolverDN::cusolverDnXpotrf failed.";
-    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-  }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-  }
-  int error = 0;
-  // Check for errors.
-  error_.CopyToHost(&error, 1);
-  if (error < 0) {
-    LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
-               << "please report it. "
-               << "cuSolverDN::cusolverDnXpotrf fatal error. "
-               << "Argument: " << -error << " is invalid.";
-    // The following line is unreachable, but return failure just to be
-    // pedantic, since the compiler does not know that.
-    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-  } else if (error > 0) {
-    *message = StringPrintf(
-        "cuSolverDN::cusolverDnXpotrf numerical failure. "
-        "The leading minor of order %d is not positive definite.",
-        error);
-    factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
-    return LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
-  }
-
-  *message = "Success";
-  factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
-  return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
-}
-
-LinearSolverTerminationType CUDADenseCholesky64Bit::Solve(
-    const double* rhs, double* solution, std::string* message) {
-  if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
-    *message = "Factorize did not complete succesfully previously.";
-    return factorize_result_;
-  }
-  // Copy RHS to GPU.
-  rhs_.CopyToGpu(rhs, num_cols_);
-  // Solve the system.
-  if (cusolverDnXpotrs(cusolver_handle_,
-                       nullptr,
-                       CUBLAS_FILL_MODE_LOWER,
-                       num_cols_,
-                       1,
-                       CUDA_R_64F,
-                       lhs_.data(),
-                       num_cols_,
-                       CUDA_R_64F,
-                       rhs_.data(),
-                       num_cols_,
-                       error_.data()) != CUSOLVER_STATUS_SUCCESS) {
-    *message = "cuSolverDN::cusolverDnXpotrs failed.";
-    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-  }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-  }
-  // Check for errors.
-  int error = 0;
-  // Copy error variable from GPU to host.
-  error_.CopyToHost(&error, 1);
-  if (error != 0) {
-    LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
-               << "Please report it. "
-               << "cuSolverDN::cusolverDnXpotrs fatal error. "
-               << "Argument: " << -error << " is invalid.";
-  }
-  // Copy X from GPU to host.
   rhs_.CopyToHost(solution, num_cols_);
   *message = "Success";
   return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
 }
 
-#endif  // CERES_CUDA_NO_64BIT_SOLVER_API
+std::unique_ptr<CUDADenseCholesky> CUDADenseCholesky::Create(
+    const LinearSolver::Options& options) {
+  if (options.dense_linear_algebra_library_type != CUDA) {
+    // The user called the wrong factory method.
+    return nullptr;
+  }
+  auto cuda_dense_cholesky =
+      std::unique_ptr<CUDADenseCholesky>(new CUDADenseCholesky());
+  std::string cuda_error;
+  if (cuda_dense_cholesky->Init(&cuda_error)) {
+    return cuda_dense_cholesky;
+  }
+  // Initialization failed, destroy the object (done automatically) and return a
+  // nullptr.
+  LOG(ERROR) << "CUDADenseCholesky::Init failed: " << cuda_error;
+  return nullptr;
+}
 
 #endif  // CERES_NO_CUDA
 
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index f7e48d4..49d780c 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -134,15 +134,15 @@
 #endif  // CERES_NO_LAPACK
 
 #ifndef CERES_NO_CUDA
-// Implementation of DenseCholesky using the cuSolver library v.11.0 or older,
-// using the legacy cuSolverDn interface.
-class CERES_NO_EXPORT CUDADenseCholesky32Bit : public DenseCholesky {
+// CUDA implementation of DenseCholesky using the cuSolverDN library using the
+// 32-bit legacy interface for maximum compatibility.
+class CERES_NO_EXPORT CUDADenseCholesky : public DenseCholesky {
  public:
-  static std::unique_ptr<CUDADenseCholesky32Bit> Create(
+  static std::unique_ptr<CUDADenseCholesky> Create(
       const LinearSolver::Options& options);
-  ~CUDADenseCholesky32Bit() override;
-  CUDADenseCholesky32Bit(const CUDADenseCholesky32Bit&) = delete;
-  CUDADenseCholesky32Bit& operator=(const CUDADenseCholesky32Bit&) = delete;
+  ~CUDADenseCholesky() override;
+  CUDADenseCholesky(const CUDADenseCholesky&) = delete;
+  CUDADenseCholesky& operator=(const CUDADenseCholesky&) = delete;
   LinearSolverTerminationType Factorize(int num_cols,
                                         double* lhs,
                                         std::string* message) override;
@@ -151,7 +151,7 @@
                                     std::string* message) override;
 
  private:
-  CUDADenseCholesky32Bit() = default;
+  CUDADenseCholesky() = default;
   // Initializes the cuSolverDN context, creates an asynchronous stream, and
   // associates the stream with cuSolverDN. Returns true iff initialization was
   // successful, else it returns false and a human-readable error message is
@@ -179,59 +179,6 @@
       LINEAR_SOLVER_FATAL_ERROR;
 };
 
-// Implementation of DenseCholesky using the cuSolver library v.11.1 or newer,
-// using the 64-bit cuSolverDn interface.
-class CERES_NO_EXPORT CUDADenseCholesky64Bit : public DenseCholesky {
- public:
-  static std::unique_ptr<CUDADenseCholesky64Bit> Create(
-      const LinearSolver::Options& options);
-  ~CUDADenseCholesky64Bit() override;
-  CUDADenseCholesky64Bit(const CUDADenseCholesky64Bit&) = delete;
-  CUDADenseCholesky64Bit& operator=(const CUDADenseCholesky64Bit&) = delete;
-  LinearSolverTerminationType Factorize(int num_cols,
-                                        double* lhs,
-                                        std::string* message) override;
-  LinearSolverTerminationType Solve(const double* rhs,
-                                    double* solution,
-                                    std::string* message) override;
-
- private:
-  CUDADenseCholesky64Bit() = default;
-  // Initializes the cuSolverDN context, creates an asynchronous stream, and
-  // associates the stream with cuSolverDN. Returns true iff initialization was
-  // successful, else it returns false and a human-readable error message is
-  // returned.
-  bool Init(std::string* message);
-
-  // Handle to the cuSOLVER context.
-  cusolverDnHandle_t cusolver_handle_ = nullptr;
-  // CUDA device stream.
-  cudaStream_t stream_ = nullptr;
-  // Number of columns in the A matrix, to be cached between calls to Factorize
-  // and Solve.
-  size_t num_cols_ = 0;
-  // GPU memory allocated for the A matrix (lhs matrix).
-  CudaBuffer<double> lhs_;
-  // GPU memory allocated for the B matrix (rhs vector).
-  CudaBuffer<double> rhs_;
-  // Workspace for cuSOLVER on the GPU.
-  CudaBuffer<uint8_t> device_workspace_;
-  // Workspace for cuSOLVER on the host.
-  std::vector<double> host_workspace_;
-  // Required for error handling with cuSOLVER.
-  CudaBuffer<int> error_;
-  // Cache the result of Factorize to ensure that when Solve is called, the
-  // factiorization of lhs is valid.
-  LinearSolverTerminationType factorize_result_ =
-      LINEAR_SOLVER_FATAL_ERROR;
-};
-
-#ifdef CERES_CUDA_NO_64BIT_SOLVER_API
-using CUDADenseCholesky = CUDADenseCholesky32Bit;
-#else
-using CUDADenseCholesky = CUDADenseCholesky64Bit;
-#endif
-
 #endif  // CERES_NO_CUDA
 
 }  // namespace internal
diff --git a/internal/ceres/dense_linear_solver_benchmark.cc b/internal/ceres/dense_linear_solver_benchmark.cc
index 2af9fa8..ba171bf 100644
--- a/internal/ceres/dense_linear_solver_benchmark.cc
+++ b/internal/ceres/dense_linear_solver_benchmark.cc
@@ -98,7 +98,9 @@
 #endif  // CERES_NO_LAPACK
 
 #ifndef CERES_NO_CUDA
-BENCHMARK(BM_DenseSolver<ceres::CUDA, ceres::DENSE_NORMAL_CHOLESKY>)
+BENCHMARK_TEMPLATE2(BM_DenseSolver, ceres::CUDA, ceres::DENSE_NORMAL_CHOLESKY)
+    ->Apply(MatrixSizes);
+BENCHMARK_TEMPLATE2(BM_DenseSolver, ceres::CUDA, ceres::DENSE_QR)
     ->Apply(MatrixSizes);
 #endif  // CERES_NO_CUDA
 
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc
index 33a7cbd..77f04ad 100644
--- a/internal/ceres/dense_qr.cc
+++ b/internal/ceres/dense_qr.cc
@@ -33,6 +33,10 @@
 #include <algorithm>
 #include <memory>
 #include <string>
+#ifndef CERES_NO_CUDA
+#include "cusolverDn.h"
+#include "cublas_v2.h"
+#endif  // CERES_NO_CUDA
 
 #ifndef CERES_NO_LAPACK
 
@@ -124,6 +128,14 @@
       LOG(FATAL) << "Ceres was compiled without support for LAPACK.";
 #endif
 
+    case CUDA:
+#ifndef CERES_NO_CUDA
+      dense_qr = CUDADenseQR::Create(options);
+      break;
+#else
+      LOG(FATAL) << "Ceres was compiled without support for CUDA.";
+#endif
+
     default:
       LOG(FATAL) << "Unknown dense linear algebra library type : "
                  << DenseLinearAlgebraLibraryTypeToString(
@@ -296,5 +308,207 @@
 
 #endif  // CERES_NO_LAPACK
 
+#ifndef CERES_NO_CUDA
+
+bool CUDADenseQR::Init(std::string* message) {
+  if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
+    *message = "cuBLAS::cublasCreate failed.";
+    return false;
+  }
+  if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnCreate failed.";
+    return false;
+  }
+  if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) !=
+      cudaSuccess) {
+    *message = "CUDA::cudaStreamCreateWithFlags failed.";
+    cusolverDnDestroy(cusolver_handle_);
+    cublasDestroy(cublas_handle_);
+    return false;
+  }
+  if (cusolverDnSetStream(cusolver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnSetStream failed.";
+    cusolverDnDestroy(cusolver_handle_);
+    cudaStreamDestroy(stream_);
+    cublasDestroy(cublas_handle_);
+    return false;
+  }
+  if (cublasSetStream(cublas_handle_, stream_) != CUBLAS_STATUS_SUCCESS) {
+    *message = "cuBLAS::cublasSetStream failed.";
+    cusolverDnDestroy(cusolver_handle_);
+    cublasDestroy(cublas_handle_);
+    cudaStreamDestroy(stream_);
+    return false;
+  }
+  error_.Reserve(1);
+  *message = "CUDADenseQR::Init Success.";
+  return true;
+}
+
+CUDADenseQR::~CUDADenseQR() {
+  if (cublas_handle_ != nullptr) {
+    CHECK_EQ(cublasDestroy(cublas_handle_), CUBLAS_STATUS_SUCCESS);
+  }
+  if (cusolver_handle_ != nullptr) {
+    CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS);
+  }
+  if (stream_ != nullptr) {
+    CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess);
+  }
+}
+
+LinearSolverTerminationType CUDADenseQR::Factorize(
+    int num_rows, int num_cols, double* lhs, std::string* message) {
+  factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  lhs_.Reserve(num_rows * num_cols);
+  tau_.Reserve(std::min(num_rows, num_cols));
+  num_rows_ = num_rows;
+  num_cols_ = num_cols;
+  lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_);
+  int device_workspace_size = 0;
+  if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
+                                  num_rows,
+                                  num_cols,
+                                  lhs_.data(),
+                                  num_rows,
+                                  &device_workspace_size) !=
+      CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnDgeqrf_bufferSize failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  device_workspace_.Reserve(device_workspace_size);
+  if (cusolverDnDgeqrf(cusolver_handle_,
+                       num_rows,
+                       num_cols,
+                       lhs_.data(),
+                       num_rows,
+                       tau_.data(),
+                       reinterpret_cast<double*>(device_workspace_.data()),
+                       device_workspace_.size(),
+                       error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnDgeqrf failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  if (cudaDeviceSynchronize() != cudaSuccess ||
+      cudaStreamSynchronize(stream_) != cudaSuccess) {
+    *message = "Cuda device synchronization failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  int error = 0;
+  error_.CopyToHost(&error, 1);
+  if (error < 0) {
+    LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
+               << "please report it. "
+               << "cuSolverDN::cusolverDnDgeqrf fatal error. "
+               << "Argument: " << -error << " is invalid.";
+    // The following line is unreachable, but return failure just to be
+    // pedantic, since the compiler does not know that.
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+
+  *message = "Success";
+  factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+  return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+LinearSolverTerminationType CUDADenseQR::Solve(
+    const double* rhs, double* solution, std::string* message) {
+  if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
+    *message = "Factorize did not complete succesfully previously.";
+    return factorize_result_;
+  }
+  rhs_.CopyToGpuAsync(rhs, num_rows_, stream_);
+  int device_workspace_size = 0;
+  if (cusolverDnDormqr_bufferSize(cusolver_handle_,
+                                  CUBLAS_SIDE_LEFT,
+                                  CUBLAS_OP_T,
+                                  num_rows_,
+                                  1,
+                                  num_cols_,
+                                  lhs_.data(),
+                                  num_rows_,
+                                  tau_.data(),
+                                  rhs_.data(),
+                                  num_rows_,
+                                  &device_workspace_size) !=
+      CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnDormqr_bufferSize failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  device_workspace_.Reserve(device_workspace_size);
+  // Compute rhs = Q^T * rhs, assuming that lhs has already been factorized.
+  // The result of factorization would have stored Q in a packed form in lhs_.
+  if (cusolverDnDormqr(cusolver_handle_,
+                       CUBLAS_SIDE_LEFT,
+                       CUBLAS_OP_T,
+                       num_rows_,
+                       1,
+                       num_cols_,
+                       lhs_.data(),
+                       num_rows_,
+                       tau_.data(),
+                       rhs_.data(),
+                       num_rows_,
+                       reinterpret_cast<double*>(device_workspace_.data()),
+                       device_workspace_.size(),
+                       error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnDormqr failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  int error = 0;
+  error_.CopyToHost(&error, 1);
+  if (error < 0) {
+    LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
+               << "Please report it."
+               << "cuSolverDN::cusolverDnDormqr fatal error. "
+               << "Argument: " << -error << " is invalid.";
+  }
+  // Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step
+  // replaced rhs by (Q^T * rhs), this is just x = R \ rhs.
+  if (cublasDtrsv(cublas_handle_,
+                  CUBLAS_FILL_MODE_UPPER,
+                  CUBLAS_OP_N,
+                  CUBLAS_DIAG_NON_UNIT,
+                  num_cols_,
+                  lhs_.data(),
+                  num_rows_,
+                  rhs_.data(),
+                  1) != CUBLAS_STATUS_SUCCESS) {
+    *message = "cuBLAS::cublasDtrsv failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  if (cudaDeviceSynchronize() != cudaSuccess ||
+      cudaStreamSynchronize(stream_) != cudaSuccess) {
+    *message = "Cuda device synchronization failed.";
+    return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+  }
+  rhs_.CopyToHost(solution, num_cols_);
+  *message = "Success";
+  return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+std::unique_ptr<CUDADenseQR> CUDADenseQR::Create(
+    const LinearSolver::Options& options) {
+  if (options.dense_linear_algebra_library_type != CUDA) {
+    // The user called the wrong factory method.
+    return nullptr;
+  }
+  auto cuda_dense_qr =
+      std::unique_ptr<CUDADenseQR>(new CUDADenseQR());
+  std::string cuda_error;
+  if (cuda_dense_qr->Init(&cuda_error)) {
+    return cuda_dense_qr;
+  }
+  // Initialization failed, destroy the object (done automatically) and return a
+  // nullptr.
+  LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error;
+  return nullptr;
+}
+
+CUDADenseQR::CUDADenseQR()  = default;
+
+#endif  // CERES_NO_CUDA
+
+
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/dense_qr.h b/internal/ceres/dense_qr.h
index aafb067..8bcccf0 100644
--- a/internal/ceres/dense_qr.h
+++ b/internal/ceres/dense_qr.h
@@ -45,6 +45,12 @@
 #include "ceres/internal/export.h"
 #include "ceres/linear_solver.h"
 #include "glog/logging.h"
+#ifndef CERES_NO_CUDA
+#include "ceres/cuda_buffer.h"
+#include "cuda_runtime.h"
+#include "cublas_v2.h"
+#include "cusolverDn.h"
+#endif  // CERES_NO_CUDA
 
 namespace ceres {
 namespace internal {
@@ -137,6 +143,65 @@
 };
 #endif  // CERES_NO_LAPACK
 
+#ifndef CERES_NO_CUDA
+// Implementation of DenseQR using the 32-bit cuSolverDn interface. A
+// requirement for using this solver is that the lhs must not be rank deficient.
+// This is because cuSolverDn does not implement the singularity-checking
+// wrapper trtrs, hence this solver directly uses trsv from CUBLAS for the
+// backsubstitution.
+class CERES_NO_EXPORT CUDADenseQR : public DenseQR {
+ public:
+  static std::unique_ptr<CUDADenseQR> Create(
+      const LinearSolver::Options& options);
+  ~CUDADenseQR() override;
+  CUDADenseQR(const CUDADenseQR&) = delete;
+  CUDADenseQR& operator=(const CUDADenseQR&) = delete;
+  LinearSolverTerminationType Factorize(int num_rows,
+                                        int num_cols,
+                                        double* lhs,
+                                        std::string* message) override;
+  LinearSolverTerminationType Solve(const double* rhs,
+                                    double* solution,
+                                    std::string* message) override;
+
+ private:
+  CUDADenseQR();
+  // Initializes the cuSolverDN context, creates an asynchronous stream, and
+  // associates the stream with cuSolverDN. Returns true iff initialization was
+  // successful, else it returns false and a human-readable error message is
+  // returned.
+  bool Init(std::string* message);
+
+  // Handle to the cuSOLVER context.
+  cusolverDnHandle_t cusolver_handle_ = nullptr;
+  // Handle to cuBLAS context.
+  cublasHandle_t cublas_handle_ = nullptr;
+  // CUDA device stream.
+  cudaStream_t stream_ = nullptr;
+  // Number of rowns in the A matrix, to be cached between calls to *Factorize
+  // and *Solve.
+  size_t num_rows_ = 0;
+  // Number of columns in the A matrix, to be cached between calls to *Factorize
+  // and *Solve.
+  size_t num_cols_ = 0;
+  // GPU memory allocated for the A matrix (lhs matrix).
+  CudaBuffer<double> lhs_;
+  // GPU memory allocated for the B matrix (rhs vector).
+  CudaBuffer<double> rhs_;
+  // GPU memory allocated for the TAU matrix (scaling of householder vectors).
+  CudaBuffer<double> tau_;
+  // Scratch space for cuSOLVER on the GPU.
+  CudaBuffer<uint8_t> device_workspace_;
+  // Required for error handling with cuSOLVER.
+  CudaBuffer<int> error_;
+  // Cache the result of Factorize to ensure that when Solve is called, the
+  // factiorization of lhs is valid.
+  LinearSolverTerminationType factorize_result_ =
+      LINEAR_SOLVER_FATAL_ERROR;
+};
+
+#endif  // CERES_NO_CUDA
+
 }  // namespace internal
 }  // namespace ceres
 
diff --git a/internal/ceres/dense_qr_test.cc b/internal/ceres/dense_qr_test.cc
index ac1258d..d9d307e 100644
--- a/internal/ceres/dense_qr_test.cc
+++ b/internal/ceres/dense_qr_test.cc
@@ -33,6 +33,7 @@
 #include <memory>
 #include <numeric>
 #include <string>
+#include <tuple>
 #include <vector>
 
 #include "Eigen/Dense"
@@ -67,6 +68,7 @@
 
   LinearSolver::Options options;
   options.dense_linear_algebra_library_type = GetParam();
+  const double kEpsilon = std::numeric_limits<double>::epsilon() * 1.5e4;
   std::unique_ptr<DenseQR> dense_qr = DenseQR::Create(options);
 
   const int kNumTrials = 10;
@@ -83,7 +85,6 @@
         Vector x = VectorType::Random(num_cols);
         Vector rhs = lhs * x;
         Vector actual = Vector::Random(num_cols);
-
         LinearSolver::Summary summary;
         summary.termination_type = dense_qr->FactorAndSolve(num_rows,
                                                             num_cols,
@@ -91,10 +92,8 @@
                                                             rhs.data(),
                                                             actual.data(),
                                                             &summary.message);
-        EXPECT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS);
-        EXPECT_NEAR((x - actual).norm() / x.norm(),
-                    0.0,
-                    std::numeric_limits<double>::epsilon() * 400)
+        ASSERT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS);
+        ASSERT_NEAR((x - actual).norm() / x.norm(), 0.0, kEpsilon)
             << "\nexpected: " << x.transpose()
             << "\nactual  : " << actual.transpose();
       }
@@ -102,17 +101,26 @@
   }
 }
 
+namespace {
+
+// NOTE: preprocessor directives in a macro are not standard conforming
+decltype(auto) MakeValues() {
+  return ::testing::Values(
+      EIGEN
 #ifndef CERES_NO_LAPACK
-INSTANTIATE_TEST_SUITE_P(_,
-                         DenseQRTest,
-                         ::testing::Values(EIGEN, LAPACK),
-                         ParamInfoToString);
-#else
-INSTANTIATE_TEST_SUITE_P(_,
-                         DenseQRTest,
-                         ::testing::Values(EIGEN),
-                         ParamInfoToString);
+      ,
+      LAPACK
 #endif
+#ifndef CERES_NO_CUDA
+      ,
+      CUDA
+#endif
+  );
+}
+
+}  // namespace
+
+INSTANTIATE_TEST_SUITE_P(_, DenseQRTest, MakeValues(), ParamInfoToString);
 
 }  // namespace internal
 }  // namespace ceres