CUDA Cleanup

* All Cuda* objects now take a ContextImpl* at construction, and save
  the context instead of individual handles.
* Since we no longer use the legacy default stream, the stream must be
  explicitly synchronized before any GPU->CPU transfer; CudaBuffer now
  performs this synchronization whenever it is asked to copy data back
  to the CPU.
* Remove all manual syncs: synchronization before GPU->CPU transfers is
  now handled entirely by CudaBuffer (see the sketch below).
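* A minimal sketch of the resulting usage pattern (illustrative only:
  host_data, host_out, n, and context are hypothetical names, with
  context an initialized ContextImpl*). Before this change:

      CudaBuffer<double> buffer;
      buffer.CopyFromCpu(host_data, n, context->stream_);
      CHECK_EQ(cudaStreamSynchronize(context->stream_), cudaSuccess);
      buffer.CopyToCpu(host_out, n);

  After this change:

      CudaBuffer<double> buffer(context);
      buffer.CopyFromCpu(host_data, n);  // async, on the context's stream
      buffer.CopyToCpu(host_out, n);     // synchronizes the stream itself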

Change-Id: Ic73cb24174a1e09842827323280e90241716cc20
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h
index ee0c589..d8f294a 100644
--- a/internal/ceres/context_impl.h
+++ b/internal/ceres/context_impl.h
@@ -81,6 +81,13 @@
   // 3. If the user explicitly selects a GPU in the host process before calling
   //    Ceres, Ceres will use that GPU.
 
+  // Note on Ceres' use of CUDA streams:
+  // All operations on the GPU are performed on a single stream. This ensures
+  // that all operations are stream-ordered, and that we do not need to
+  // explicitly synchronize the stream at the end of every operation. Stream
+  // synchronization occurs only before GPU to CPU transfers, and is handled
+  // by CudaBuffer.
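+  //
+  // A minimal sketch of the resulting usage pattern (illustrative only;
+  // host_data, host_out, and n are hypothetical, and context is an
+  // initialized ContextImpl*):
+  //
+  //   CudaBuffer<double> buffer(context);
+  //   buffer.CopyFromCpuVector(host_data);  // async, on context->stream_
+  //   ... launch kernels on context->stream_ ...
+  //   buffer.CopyToCpu(host_out, n);        // syncs the stream, then copies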
+
   // Initializes cuBLAS, cuSOLVER, and cuSPARSE contexts, creates an
   // asynchronous CUDA stream, and associates the stream with the contexts.
   // Returns true iff initialization was successful, else it returns false and a
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index dba1706..97d126c 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -31,6 +31,7 @@
 #ifndef CERES_INTERNAL_CUDA_BUFFER_H_
 #define CERES_INTERNAL_CUDA_BUFFER_H_
 
+#include "ceres/context_impl.h"
 #include "ceres/internal/config.h"
 
 #ifndef CERES_NO_CUDA
@@ -40,6 +41,7 @@
 #include "cuda_runtime.h"
 #include "glog/logging.h"
 
+namespace ceres::internal {
 // An encapsulated buffer to maintain GPU memory, and handle transfers between
 // GPU and system memory. It is the responsibility of the user to ensure that
 // the appropriate GPU device is selected before each subroutine is called. This
@@ -49,7 +51,10 @@
 template <typename T>
 class CudaBuffer {
  public:
-  CudaBuffer() = default;
+  explicit CudaBuffer(ContextImpl* context) : context_(context) {}
+  CudaBuffer(ContextImpl* context, int size) : context_(context) {
+    Reserve(size);
+  }
   CudaBuffer(const CudaBuffer&) = delete;
   CudaBuffer& operator=(const CudaBuffer&) = delete;
 
@@ -75,51 +80,61 @@
 
   // Perform an asynchronous copy from CPU memory to GPU memory managed by this
-  // CudaBuffer instance using the stream provided.
+  // CudaBuffer instance, using the context's stream.
-  void CopyFromCpu(const T* data, const size_t size, cudaStream_t stream) {
+  void CopyFromCpu(const T* data, const size_t size) {
     Reserve(size);
-    CHECK_EQ(cudaMemcpyAsync(
-                 data_, data, size * sizeof(T), cudaMemcpyHostToDevice, stream),
+    CHECK_EQ(cudaMemcpyAsync(data_,
+                             data,
+                             size * sizeof(T),
+                             cudaMemcpyHostToDevice,
+                             context_->stream_),
              cudaSuccess);
   }
 
   // Perform an asynchronous copy from a vector in CPU memory to GPU memory
   // managed by this CudaBuffer instance.
-  void CopyFromCpuVector(const std::vector<T>& data, cudaStream_t stream) {
+  void CopyFromCpuVector(const std::vector<T>& data) {
     Reserve(data.size());
     CHECK_EQ(cudaMemcpyAsync(data_,
                              data.data(),
                              data.size() * sizeof(T),
                              cudaMemcpyHostToDevice,
-                             stream),
+                             context_->stream_),
              cudaSuccess);
   }
 
   // Perform an asynchronous copy from another GPU memory array to the GPU
-  // memory managed by this CudaBuffer instance using the stream provided.
+  // memory managed by this CudaBuffer instance, using the context's stream.
-  void CopyFromGPUArray(const T* data, const size_t size, cudaStream_t stream) {
+  void CopyFromGPUArray(const T* data, const size_t size) {
     Reserve(size);
-    CHECK_EQ(
-        cudaMemcpyAsync(
-            data_, data, size * sizeof(T), cudaMemcpyDeviceToDevice, stream),
-        cudaSuccess);
+    CHECK_EQ(cudaMemcpyAsync(data_,
+                             data,
+                             size * sizeof(T),
+                             cudaMemcpyDeviceToDevice,
+                             context_->stream_),
+             cudaSuccess);
   }
 
   // Copy data from the GPU memory managed by this CudaBuffer instance to CPU
   // memory. It is the caller's responsibility to ensure that the CPU memory
   // pointer is valid, i.e. it is not null, and that it points to memory of
-  // at least this->size() size. This copy is necessarily synchronous since any
-  // potential GPU kernels that may be writing to the buffer must finish before
-  // the transfer happens.
+  // at least this->size() size. This method ensures all previously dispatched
+  // GPU operations on the context's stream have completed before copying the
+  // data to CPU memory.
   void CopyToCpu(T* data, const size_t size) const {
     CHECK(data_ != nullptr);
-    CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
+    CHECK_EQ(cudaMemcpyAsync(data,
+                             data_,
+                             size * sizeof(T),
+                             cudaMemcpyDeviceToHost,
+                             context_->stream_),
              cudaSuccess);
+    CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
   }
 
   // Copy N items from another GPU memory array to the GPU memory managed by
   // this CudaBuffer instance, growing this buffer's size if needed. This copy
-  // is asynchronous, and operates on the stream provided.
+  // is asynchronous, and operates on the context's stream.
-  void CopyNItemsFrom(int n, const CudaBuffer<T>& other, cudaStream_t stream) {
+  void CopyNItemsFrom(int n, const CudaBuffer<T>& other) {
     Reserve(n);
     CHECK(other.data_ != nullptr);
     CHECK(data_ != nullptr);
@@ -127,7 +142,7 @@
                              other.data_,
                              size_ * sizeof(T),
                              cudaMemcpyDeviceToDevice,
-                             stream),
+                             context_->stream_),
              cudaSuccess);
   }
 
@@ -141,7 +156,9 @@
  private:
   T* data_ = nullptr;
   size_t size_ = 0;
+  ContextImpl* context_ = nullptr;
 };
+}  // namespace ceres::internal
 
 #endif  // CERES_NO_CUDA
 
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc
index 053b442..9290aa5 100644
--- a/internal/ceres/cuda_kernels_test.cc
+++ b/internal/ceres/cuda_kernels_test.cc
@@ -36,6 +36,7 @@
 #include <string>
 #include <vector>
 
+#include "ceres/context_impl.h"
 #include "ceres/cuda_buffer.h"
 #include "ceres/internal/config.h"
 #include "ceres/internal/eigen.h"
@@ -48,10 +49,13 @@
 #ifndef CERES_NO_CUDA
 
 TEST(CudaFP64ToFP32, SimpleConversions) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
-  CudaBuffer<double> fp64_gpu;
-  fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
-  CudaBuffer<float> fp32_gpu;
+  CudaBuffer<double> fp64_gpu(&context);
+  fp64_gpu.CopyFromCpuVector(fp64_cpu);
+  CudaBuffer<float> fp32_gpu(&context);
   fp32_gpu.Reserve(fp64_cpu.size());
   CudaFP64ToFP32(
       fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
@@ -63,6 +67,9 @@
 }
 
 TEST(CudaFP64ToFP32, NumericallyExtremeValues) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<double> fp64_cpu = {
       DBL_MIN, 10.0 * DBL_MIN, DBL_MAX, 0.1 * DBL_MAX};
   // First just make sure that the compiler has represented these values
@@ -71,9 +78,9 @@
   EXPECT_GT(fp64_cpu[1], 0.0);
   EXPECT_TRUE(std::isfinite(fp64_cpu[2]));
   EXPECT_TRUE(std::isfinite(fp64_cpu[3]));
-  CudaBuffer<double> fp64_gpu;
-  fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
-  CudaBuffer<float> fp32_gpu;
+  CudaBuffer<double> fp64_gpu(&context);
+  fp64_gpu.CopyFromCpuVector(fp64_cpu);
+  CudaBuffer<float> fp32_gpu(&context);
   fp32_gpu.Reserve(fp64_cpu.size());
   CudaFP64ToFP32(
       fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
@@ -86,10 +93,13 @@
 }
 
 TEST(CudaFP32ToFP64, SimpleConversions) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
-  CudaBuffer<float> fp32_gpu;
-  fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault);
-  CudaBuffer<double> fp64_gpu;
+  CudaBuffer<float> fp32_gpu(&context);
+  fp32_gpu.CopyFromCpuVector(fp32_cpu);
+  CudaBuffer<double> fp64_gpu(&context);
   fp64_gpu.Reserve(fp32_cpu.size());
   CudaFP32ToFP64(
       fp32_gpu.data(), fp64_gpu.data(), fp32_cpu.size(), cudaStreamDefault);
@@ -101,9 +111,12 @@
 }
 
 TEST(CudaSetZeroFP32, NonZeroInput) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
-  CudaBuffer<float> fp32_gpu;
-  fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault);
+  CudaBuffer<float> fp32_gpu(&context);
+  fp32_gpu.CopyFromCpuVector(fp32_cpu);
   CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), cudaStreamDefault);
   std::vector<float> fp32_cpu_zero(fp32_cpu.size());
   fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size());
@@ -113,9 +126,12 @@
 }
 
 TEST(CudaSetZeroFP64, NonZeroInput) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
-  CudaBuffer<double> fp64_gpu;
-  fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
+  CudaBuffer<double> fp64_gpu(&context);
+  fp64_gpu.CopyFromCpuVector(fp64_cpu);
   CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
   std::vector<double> fp64_cpu_zero(fp64_cpu.size());
   fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size());
@@ -125,13 +141,16 @@
 }
 
 TEST(CudaDsxpy, DoubleValues) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<float> fp32_cpu_a = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
   std::vector<double> fp64_cpu_b = {
       1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
-  CudaBuffer<float> fp32_gpu_a;
-  fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a, cudaStreamDefault);
-  CudaBuffer<double> fp64_gpu_b;
-  fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b, cudaStreamDefault);
+  CudaBuffer<float> fp32_gpu_a(&context);
+  fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a);
+  CudaBuffer<double> fp64_gpu_b(&context);
+  fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b);
   CudaDsxpy(fp64_gpu_b.data(),
             fp32_gpu_a.data(),
             fp32_gpu_a.size(),
@@ -143,15 +162,18 @@
 }
 
 TEST(CudaDtDxpy, ComputeFourItems) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<double> x_cpu = {1, 2, 3, 4};
   std::vector<double> y_cpu = {4, 3, 2, 1};
   std::vector<double> d_cpu = {10, 20, 30, 40};
-  CudaBuffer<double> x_gpu;
-  x_gpu.CopyFromCpuVector(x_cpu, cudaStreamDefault);
-  CudaBuffer<double> y_gpu;
-  y_gpu.CopyFromCpuVector(y_cpu, cudaStreamDefault);
-  CudaBuffer<double> d_gpu;
-  d_gpu.CopyFromCpuVector(d_cpu, cudaStreamDefault);
+  CudaBuffer<double> x_gpu(&context);
+  x_gpu.CopyFromCpuVector(x_cpu);
+  CudaBuffer<double> y_gpu(&context);
+  y_gpu.CopyFromCpuVector(y_cpu);
+  CudaBuffer<double> d_gpu(&context);
+  d_gpu.CopyFromCpuVector(d_cpu);
   CudaDtDxpy(y_gpu.data(),
              d_gpu.data(),
              x_gpu.data(),
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
index 1e361d2..7ee1761 100644
--- a/internal/ceres/cuda_sparse_matrix.cc
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -59,17 +59,21 @@
 
 namespace ceres::internal {
 
-CudaSparseMatrix::CudaSparseMatrix(
-    ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) {
+CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
+                                   const CompressedRowSparseMatrix& crs_matrix)
+    : context_(context),
+      rows_{context},
+      cols_{context},
+      values_{context},
+      spmv_buffer_{context} {
   DCHECK_NE(context, nullptr);
   CHECK(context->IsCudaInitialized());
-  context_ = context;
   num_rows_ = crs_matrix.num_rows();
   num_cols_ = crs_matrix.num_cols();
   num_nonzeros_ = crs_matrix.num_nonzeros();
-  rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1, context_->stream_);
-  cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_, context_->stream_);
-  values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_);
+  rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1);
+  cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_);
+  values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
   cusparseCreateCsr(&descr_,
                     num_rows_,
                     num_cols_,
@@ -96,7 +100,7 @@
   CHECK_EQ(num_rows_, crs_matrix.num_rows());
   CHECK_EQ(num_cols_, crs_matrix.num_cols());
   CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
-  values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_);
+  values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
 }
 
 void CudaSparseMatrix::SpMv(cusparseOperation_t op,
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h
index 62f6b77..d5f3332 100644
--- a/internal/ceres/cuda_sparse_matrix.h
+++ b/internal/ceres/cuda_sparse_matrix.h
@@ -97,6 +97,7 @@
   int num_cols_ = 0;
   int num_nonzeros_ = 0;
 
+  ContextImpl* context_ = nullptr;
   // CSR row indices.
   CudaBuffer<int32_t> rows_;
   // CSR column indices.
@@ -104,8 +105,6 @@
   // CSR values.
   CudaBuffer<double> values_;
 
-  ContextImpl* context_ = nullptr;
-
   // CuSparse object that describes this matrix.
   cusparseSpMatDescr_t descr_ = nullptr;
 
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
index 46e6cb2..274a47a 100644
--- a/internal/ceres/cuda_vector.cc
+++ b/internal/ceres/cuda_vector.cc
@@ -50,17 +50,17 @@
 
 namespace ceres::internal {
 
-CudaVector::CudaVector(ContextImpl* context, int size) {
+CudaVector::CudaVector(ContextImpl* context, int size)
+    : context_(context), data_(context, size) {
   DCHECK_NE(context, nullptr);
-  CHECK(context->IsCudaInitialized());
-  context_ = context;
+  DCHECK(context->IsCudaInitialized());
   Resize(size);
 }
 
 CudaVector& CudaVector::operator=(const CudaVector& other) {
   if (this != &other) {
     Resize(other.num_rows());
-    data_.CopyFromGPUArray(other.data_.data(), num_rows_, context_->stream_);
+    data_.CopyFromGPUArray(other.data_.data(), num_rows_);
   }
   return *this;
 }
@@ -107,7 +107,7 @@
 
 void CudaVector::CopyFromCpu(const Vector& x) {
   data_.Reserve(x.rows());
-  data_.CopyFromCpu(x.data(), x.rows(), context_->stream_);
+  data_.CopyFromCpu(x.data(), x.rows());
   num_rows_ = x.rows();
   DestroyDescriptor();
   CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F),
@@ -117,17 +117,11 @@
 void CudaVector::CopyTo(Vector* x) const {
   CHECK(x != nullptr);
   x->resize(num_rows_);
-  // Need to synchronize with any GPU kernels that may be writing to the
-  // buffer before the transfer happens.
-  CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
   data_.CopyToCpu(x->data(), num_rows_);
 }
 
 void CudaVector::CopyTo(double* x) const {
   CHECK(x != nullptr);
-  // Need to synchronize with any GPU kernels that may be writing to the
-  // buffer before the transfer happens.
-  CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
   data_.CopyToCpu(x, num_rows_);
 }
 
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index 6d328ab..93e9f0d 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -347,15 +347,12 @@
 
 #ifndef CERES_NO_CUDA
 
-bool CUDADenseCholesky::Init(ContextImpl* context, std::string* message) {
-  CHECK(context->IsCudaInitialized())
-      << "CUDADenseCholesky requires CUDA initialization.";
-  cusolver_handle_ = context->cusolver_handle_;
-  stream_ = context->stream_;
-  error_.Reserve(1);
-  *message = "CUDADenseCholesky::Init Success.";
-  return true;
-}
+CUDADenseCholesky::CUDADenseCholesky(ContextImpl* context)
+    : context_(context),
+      lhs_{context},
+      rhs_{context},
+      device_workspace_{context},
+      error_(context, 1) {}
 
 LinearSolverTerminationType CUDADenseCholesky::Factorize(int num_cols,
                                                          double* lhs,
@@ -363,9 +360,9 @@
   factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
   lhs_.Reserve(num_cols * num_cols);
   num_cols_ = num_cols;
-  lhs_.CopyFromCpu(lhs, num_cols * num_cols, stream_);
+  lhs_.CopyFromCpu(lhs, num_cols * num_cols);
   int device_workspace_size = 0;
-  if (cusolverDnDpotrf_bufferSize(cusolver_handle_,
+  if (cusolverDnDpotrf_bufferSize(context_->cusolver_handle_,
                                   CUBLAS_FILL_MODE_LOWER,
                                   num_cols,
                                   lhs_.data(),
@@ -376,7 +373,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   device_workspace_.Reserve(device_workspace_size);
-  if (cusolverDnDpotrf(cusolver_handle_,
+  if (cusolverDnDpotrf(context_->cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols,
                        lhs_.data(),
@@ -387,11 +384,6 @@
     *message = "cuSolverDN::cusolverDnDpotrf failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   int error = 0;
   error_.CopyToCpu(&error, 1);
   if (error < 0) {
@@ -422,8 +414,8 @@
     *message = "Factorize did not complete successfully previously.";
     return factorize_result_;
   }
-  rhs_.CopyFromCpu(rhs, num_cols_, stream_);
-  if (cusolverDnDpotrs(cusolver_handle_,
+  rhs_.CopyFromCpu(rhs, num_cols_);
+  if (cusolverDnDpotrs(context_->cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols_,
                        1,
@@ -435,11 +427,6 @@
     *message = "cuSolverDN::cusolverDnDpotrs failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   int error = 0;
   error_.CopyToCpu(&error, 1);
   if (error != 0) {
@@ -455,56 +442,30 @@
 
 std::unique_ptr<CUDADenseCholesky> CUDADenseCholesky::Create(
     const LinearSolver::Options& options) {
-  if (options.dense_linear_algebra_library_type != CUDA) {
-    // The user called the wrong factory method.
+  if (options.dense_linear_algebra_library_type != CUDA ||
+      options.context == nullptr || !options.context->IsCudaInitialized()) {
     return nullptr;
   }
-  auto cuda_dense_cholesky =
-      std::unique_ptr<CUDADenseCholesky>(new CUDADenseCholesky());
-  std::string cuda_error;
-  if (cuda_dense_cholesky->Init(options.context, &cuda_error)) {
-    return cuda_dense_cholesky;
-  }
-  // Initialization failed, destroy the object (done automatically) and return
-  // a nullptr.
-  LOG(ERROR) << "CUDADenseCholesky::Init failed: " << cuda_error;
-  return nullptr;
+  return std::unique_ptr<CUDADenseCholesky>(
+      new CUDADenseCholesky(options.context));
 }
 
 std::unique_ptr<CUDADenseCholeskyMixedPrecision>
 CUDADenseCholeskyMixedPrecision::Create(const LinearSolver::Options& options) {
   if (options.dense_linear_algebra_library_type != CUDA ||
-      !options.use_mixed_precision_solves) {
-    // The user called the wrong factory method.
+      !options.use_mixed_precision_solves || options.context == nullptr ||
+      !options.context->IsCudaInitialized()) {
     return nullptr;
   }
-  auto solver = std::unique_ptr<CUDADenseCholeskyMixedPrecision>(
-      new CUDADenseCholeskyMixedPrecision());
-  std::string cuda_error;
-  if (solver->Init(options, &cuda_error)) {
-    return solver;
-  }
-  LOG(ERROR) << "CUDADenseCholeskyMixedPrecision::Init failed: " << cuda_error;
-  return nullptr;
-}
-
-bool CUDADenseCholeskyMixedPrecision::Init(const LinearSolver::Options& options,
-                                           std::string* message) {
-  CHECK(options.context->IsCudaInitialized())
-      << "CUDADenseCholeskyMixedPrecision requires CUDA initialization.";
-  cusolver_handle_ = options.context->cusolver_handle_;
-  cublas_handle_ = options.context->cublas_handle_;
-  stream_ = options.context->stream_;
-  error_.Reserve(1);
-  max_num_refinement_iterations_ = options.max_num_refinement_iterations;
-  *message = "CUDADenseCholeskyMixedPrecision::Init Success.";
-  return true;
+  return std::unique_ptr<CUDADenseCholeskyMixedPrecision>(
+      new CUDADenseCholeskyMixedPrecision(
+          options.context, options.max_num_refinement_iterations));
 }
 
 LinearSolverTerminationType
 CUDADenseCholeskyMixedPrecision::CudaCholeskyFactorize(std::string* message) {
   int device_workspace_size = 0;
-  if (cusolverDnSpotrf_bufferSize(cusolver_handle_,
+  if (cusolverDnSpotrf_bufferSize(context_->cusolver_handle_,
                                   CUBLAS_FILL_MODE_LOWER,
                                   num_cols_,
                                   lhs_fp32_.data(),
@@ -515,7 +476,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   device_workspace_.Reserve(device_workspace_size);
-  if (cusolverDnSpotrf(cusolver_handle_,
+  if (cusolverDnSpotrf(context_->cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols_,
                        lhs_fp32_.data(),
@@ -526,11 +487,6 @@
     *message = "cuSolverDN::cusolverDnSpotrf failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   int error = 0;
   error_.CopyToCpu(&error, 1);
   if (error < 0) {
@@ -560,9 +516,9 @@
                            residual_fp32_.data(),
                            num_cols_ * sizeof(float),
                            cudaMemcpyDeviceToDevice,
-                           stream_),
+                           context_->stream_),
            cudaSuccess);
-  if (cusolverDnSpotrs(cusolver_handle_,
+  if (cusolverDnSpotrs(context_->cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols_,
                        1,
@@ -574,11 +530,6 @@
     *message = "cuSolverDN::cusolverDnDpotrs failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   int error = 0;
   error_.CopyToCpu(&error, 1);
   if (error != 0) {
@@ -591,18 +542,34 @@
   return LinearSolverTerminationType::SUCCESS;
 }
 
+CUDADenseCholeskyMixedPrecision::CUDADenseCholeskyMixedPrecision(
+    ContextImpl* context, int max_num_refinement_iterations)
+    : context_(context),
+      lhs_fp64_{context},
+      rhs_fp64_{context},
+      lhs_fp32_{context},
+      device_workspace_{context},
+      error_(context, 1),
+      x_fp64_{context},
+      correction_fp32_{context},
+      residual_fp32_{context},
+      residual_fp64_{context},
+      max_num_refinement_iterations_(max_num_refinement_iterations) {}
+
 LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Factorize(
     int num_cols, double* lhs, std::string* message) {
   num_cols_ = num_cols;
 
   // Copy fp64 version of lhs to GPU.
   lhs_fp64_.Reserve(num_cols * num_cols);
-  lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols, stream_);
+  lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols);
 
   // Create an fp32 copy of lhs, lhs_fp32.
   lhs_fp32_.Reserve(num_cols * num_cols);
-  CudaFP64ToFP32(
-      lhs_fp64_.data(), lhs_fp32_.data(), num_cols * num_cols, stream_);
+  CudaFP64ToFP32(lhs_fp64_.data(),
+                 lhs_fp32_.data(),
+                 num_cols * num_cols,
+                 context_->stream_);
 
   // Factorize lhs_fp32.
   factorize_result_ = CudaCholeskyFactorize(message);
@@ -625,32 +592,35 @@
   residual_fp64_.Reserve(num_cols_);
 
   // Initialize x = 0.
-  CudaSetZeroFP64(x_fp64_.data(), num_cols_, stream_);
+  CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->stream_);
 
   // Initialize residual = rhs.
-  rhs_fp64_.CopyFromCpu(rhs, num_cols_, stream_);
-  residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_);
+  rhs_fp64_.CopyFromCpu(rhs, num_cols_);
+  residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_);
 
   for (int i = 0; i <= max_num_refinement_iterations_; ++i) {
     // Cast residual from fp64 to fp32.
-    CudaFP64ToFP32(
-        residual_fp64_.data(), residual_fp32_.data(), num_cols_, stream_);
+    CudaFP64ToFP32(residual_fp64_.data(),
+                   residual_fp32_.data(),
+                   num_cols_,
+                   context_->stream_);
     // [fp32] c = lhs^-1 * residual.
     auto result = CudaCholeskySolve(message);
     if (result != LinearSolverTerminationType::SUCCESS) {
       return result;
     }
     // [fp64] x += c.
-    CudaDsxpy(x_fp64_.data(), correction_fp32_.data(), num_cols_, stream_);
+    CudaDsxpy(
+        x_fp64_.data(), correction_fp32_.data(), num_cols_, context_->stream_);
     if (i < max_num_refinement_iterations_) {
       // [fp64] residual = rhs - lhs * x
       // This is done in two steps:
       // 1. [fp64] residual = rhs
-      residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_);
+      residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_);
       // 2. [fp64] residual = residual - lhs * x
       double alpha = -1.0;
       double beta = 1.0;
-      cublasDsymv(cublas_handle_,
+      cublasDsymv(context_->cublas_handle_,
                   CUBLAS_FILL_MODE_LOWER,
                   num_cols_,
                   &alpha,
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index 1c561c1..6380fc6 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -40,6 +40,7 @@
 #include <vector>
 
 #include "Eigen/Dense"
+#include "ceres/context_impl.h"
 #include "ceres/cuda_buffer.h"
 #include "ceres/linear_solver.h"
 #include "glog/logging.h"
@@ -208,16 +209,9 @@
                                     std::string* message) override;
 
  private:
-  CUDADenseCholesky() = default;
-  // Picks up the cuSolverDN and cuStream handles from the context. If
-  // the context is unable to initialize CUDA, returns false with a
-  // human-readable message indicating the reason.
-  bool Init(ContextImpl* context, std::string* message);
+  explicit CUDADenseCholesky(ContextImpl* context);
 
-  // Handle to the cuSOLVER context.
-  cusolverDnHandle_t cusolver_handle_ = nullptr;
-  // CUDA device stream.
-  cudaStream_t stream_ = nullptr;
+  ContextImpl* context_ = nullptr;
   // Number of columns in the A matrix, to be cached between calls to *Factorize
   // and *Solve.
   size_t num_cols_ = 0;
@@ -266,7 +260,9 @@
                                     std::string* message) override;
 
  private:
-  CUDADenseCholeskyMixedPrecision() = default;
+  CUDADenseCholeskyMixedPrecision(ContextImpl* context,
+                                  int max_num_refinement_iterations);
+
   // Helper function to wrap Cuda boilerplate needed to call Spotrf.
   LinearSolverTerminationType CudaCholeskyFactorize(std::string* message);
   // Helper function to wrap Cuda boilerplate needed to call Spotrs.
@@ -277,9 +273,7 @@
-  // human-readable message indicating the reason.
-  bool Init(const LinearSolver::Options& options, std::string* message);
 
-  cusolverDnHandle_t cusolver_handle_ = nullptr;
-  cublasHandle_t cublas_handle_ = nullptr;
-  cudaStream_t stream_ = nullptr;
+  ContextImpl* context_ = nullptr;
   // Number of columns in the A matrix, to be cached between calls to *Factorize
   // and *Solve.
   size_t num_cols_ = 0;
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc
index 22727f8..5154fb1 100644
--- a/internal/ceres/dense_qr.cc
+++ b/internal/ceres/dense_qr.cc
@@ -311,17 +311,13 @@
 
 #ifndef CERES_NO_CUDA
 
-bool CUDADenseQR::Init(ContextImpl* context, std::string* message) {
-  if (!context->InitCuda(message)) {
-    return false;
-  }
-  cublas_handle_ = context->cublas_handle_;
-  cusolver_handle_ = context->cusolver_handle_;
-  stream_ = context->stream_;
-  error_.Reserve(1);
-  *message = "CUDADenseQR::Init Success.";
-  return true;
-}
+CUDADenseQR::CUDADenseQR(ContextImpl* context)
+    : context_(context),
+      lhs_{context},
+      rhs_{context},
+      tau_{context},
+      device_workspace_{context},
+      error_(context, 1) {}
 
 LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows,
                                                    int num_cols,
@@ -332,9 +328,9 @@
   tau_.Reserve(std::min(num_rows, num_cols));
   num_rows_ = num_rows;
   num_cols_ = num_cols;
-  lhs_.CopyFromCpu(lhs, num_rows * num_cols, stream_);
+  lhs_.CopyFromCpu(lhs, num_rows * num_cols);
   int device_workspace_size = 0;
-  if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
+  if (cusolverDnDgeqrf_bufferSize(context_->cusolver_handle_,
                                   num_rows,
                                   num_cols,
                                   lhs_.data(),
@@ -345,7 +341,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   device_workspace_.Reserve(device_workspace_size);
-  if (cusolverDnDgeqrf(cusolver_handle_,
+  if (cusolverDnDgeqrf(context_->cusolver_handle_,
                        num_rows,
                        num_cols,
                        lhs_.data(),
@@ -357,11 +353,6 @@
     *message = "cuSolverDN::cusolverDnDgeqrf failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   int error = 0;
   error_.CopyToCpu(&error, 1);
   if (error < 0) {
@@ -386,9 +377,9 @@
     *message = "Factorize did not complete successfully previously.";
     return factorize_result_;
   }
-  rhs_.CopyFromCpu(rhs, num_rows_, stream_);
+  rhs_.CopyFromCpu(rhs, num_rows_);
   int device_workspace_size = 0;
-  if (cusolverDnDormqr_bufferSize(cusolver_handle_,
+  if (cusolverDnDormqr_bufferSize(context_->cusolver_handle_,
                                   CUBLAS_SIDE_LEFT,
                                   CUBLAS_OP_T,
                                   num_rows_,
@@ -407,7 +398,7 @@
   device_workspace_.Reserve(device_workspace_size);
   // Compute rhs = Q^T * rhs, assuming that lhs has already been factorized.
   // The result of factorization would have stored Q in a packed form in lhs_.
-  if (cusolverDnDormqr(cusolver_handle_,
+  if (cusolverDnDormqr(context_->cusolver_handle_,
                        CUBLAS_SIDE_LEFT,
                        CUBLAS_OP_T,
                        num_rows_,
@@ -434,7 +425,7 @@
   }
   // Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step
   // replaced rhs by (Q^T * rhs), this is just x = R \ rhs.
-  if (cublasDtrsv(cublas_handle_,
+  if (cublasDtrsv(context_->cublas_handle_,
                   CUBLAS_FILL_MODE_UPPER,
                   CUBLAS_OP_N,
                   CUBLAS_DIAG_NON_UNIT,
@@ -446,11 +437,6 @@
     *message = "cuBLAS::cublasDtrsv failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   rhs_.CopyToCpu(solution, num_cols_);
   *message = "Success";
   return LinearSolverTerminationType::SUCCESS;
@@ -458,23 +444,13 @@
 
 std::unique_ptr<CUDADenseQR> CUDADenseQR::Create(
     const LinearSolver::Options& options) {
-  if (options.dense_linear_algebra_library_type != CUDA) {
-    // The user called the wrong factory method.
+  if (options.dense_linear_algebra_library_type != CUDA ||
+      options.context == nullptr || !options.context->IsCudaInitialized()) {
     return nullptr;
   }
-  auto cuda_dense_qr = std::unique_ptr<CUDADenseQR>(new CUDADenseQR());
-  std::string cuda_error;
-  if (cuda_dense_qr->Init(options.context, &cuda_error)) {
-    return cuda_dense_qr;
-  }
-  // Initialization failed, destroy the object (done automatically) and return a
-  // nullptr.
-  LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error;
-  return nullptr;
+  return std::unique_ptr<CUDADenseQR>(new CUDADenseQR(options.context));
 }
 
-CUDADenseQR::CUDADenseQR() = default;
-
 #endif  // CERES_NO_CUDA
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/dense_qr.h b/internal/ceres/dense_qr.h
index 0d2577a..270a912 100644
--- a/internal/ceres/dense_qr.h
+++ b/internal/ceres/dense_qr.h
@@ -40,6 +40,7 @@
 #include <vector>
 
 #include "Eigen/Dense"
+#include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/internal/export.h"
@@ -164,18 +165,9 @@
                                     std::string* message) override;
 
  private:
-  CUDADenseQR();
-  // Picks up the cuSolverDN, cuBLAS, and cuStream handles from the context. If
-  // the context is unable to initialize CUDA, returns false with a
-  // human-readable message indicating the reason.
-  bool Init(ContextImpl* context, std::string* message);
+  explicit CUDADenseQR(ContextImpl* context);
 
-  // Handle to the cuSOLVER context.
-  cusolverDnHandle_t cusolver_handle_ = nullptr;
-  // Handle to cuBLAS context.
-  cublasHandle_t cublas_handle_ = nullptr;
-  // CUDA device stream.
-  cudaStream_t stream_ = nullptr;
+  ContextImpl* context_ = nullptr;
   // Number of rows in the A matrix, to be cached between calls to *Factorize
   // and *Solve.
   size_t num_rows_ = 0;