CUDA Cleanup

* Renamed several CudaBuffer methods for clarity and consistency.
* Added unit tests for the custom CUDA kernels.
* Set specific CUDA architectures when the CMake version supports it (3.18+).
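
CudaBuffer method renames (old -> new):
  CopyToGpuAsync   -> CopyFromCpu
  CopyToGpu        -> CopyFromCpuVector
  CopyFromGpuAsync -> CopyFromGPUArray
  CopyToHost       -> CopyToCpu

A typical round-trip with the renamed API, as a rough sketch (host_vec and
stream here are placeholders, not names from this change):

  CudaBuffer<double> buf;
  buf.CopyFromCpuVector(host_vec, stream);          // asynchronous H2D
  // ... launch kernels reading/writing buf.data() on `stream` ...
  buf.CopyToCpu(host_vec.data(), host_vec.size());  // synchronous D2H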

Change-Id: I269fb1089b80b25e17bca772ef8d70e7894214b8
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c2e1e4e..b1fcddd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -237,6 +237,11 @@
         "${CUDA_LIBRARIES};"
         "${CUDA_cusolver_LIBRARY};"
         "${CUDA_cusparse_LIBRARY}")
+    if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")
+      # Build natively for Maxwell, Pascal, Volta, and Ampere GPUs. Turing
+      # (7.5) GPUs can JIT-compile the embedded compute_70 PTX.
+      set(CMAKE_CUDA_ARCHITECTURES "50;60;70;80")
+      message("-- Setting CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}")
+    endif()
     enable_language(CUDA)
   else (CUDA_FOUND)
     message("-- Did not find CUDA library, disabling CUDA support.")
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 75df3c6..ea21df2 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -482,6 +482,7 @@
   ceres_test(cubic_interpolation)
   ceres_test(cuda_dense_cholesky)
   ceres_test(cuda_dense_qr)
+  ceres_test(cuda_kernels)
   ceres_test(dense_linear_solver)
   ceres_test(dense_cholesky)
   ceres_test(dense_qr)
diff --git a/internal/ceres/ceres_cuda_kernels.cu b/internal/ceres/ceres_cuda_kernels.cu
index 2fd76b5..cd045e3 100644
--- a/internal/ceres/ceres_cuda_kernels.cu
+++ b/internal/ceres/ceres_cuda_kernels.cu
@@ -106,4 +106,24 @@
       <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
 }
 
+__global__ void CudaDtDxpyKernel(double* __restrict__ y,
+                                 const double* __restrict__ D,
+                                 const double* __restrict__ x,
+                                 const int size) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
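+  // Guard the final partial block: threads with i >= size do nothing.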
+  if (i < size) {
+    y[i] = y[i] + D[i] * D[i] * x[i];
+  }
+}
+
+void CudaDtDxpy(double* y,
+                const double* D,
+                const double* x,
+                const int size,
+                cudaStream_t stream) {
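+  // Round the block count up so a partial final block covers the tail.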
+  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+  CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(
+      y, D, x, size);
+}
+
 }  // namespace ceres::internal
\ No newline at end of file
diff --git a/internal/ceres/ceres_cuda_kernels.h b/internal/ceres/ceres_cuda_kernels.h
index f033147..91675e8 100644
--- a/internal/ceres/ceres_cuda_kernels.h
+++ b/internal/ceres/ceres_cuda_kernels.h
@@ -62,6 +62,13 @@
 // double (FP64). Both arrays must already be on GPU memory.
 void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream);
 
+// Compute y[i] = y[i] + D[i]^2 x[i]. All arrays must already be on GPU memory.
+void CudaDtDxpy(double* y,
+                const double* D,
+                const double* x,
+                const int size,
+                cudaStream_t stream);
+
 }  // namespace ceres::internal
 
 #endif  // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index 64774fa..89828f9 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -45,8 +45,7 @@
 // the appropriate GPU device is selected before each subroutine is called. This
 // is particularly important when using multiple GPU devices on different CPU
 // threads, since active Cuda devices are determined by the cuda runtime on a
-// per-thread basis. Note that unless otherwise specified, all methods use the
-// default stream, and are synchronous.
+// per-thread basis.
 template <typename T>
 class CudaBuffer {
  public:
@@ -72,17 +71,30 @@
     }
   }
 
-  // Perform an asynchronous copy from CPU memory to GPU memory using the stream
-  // provided.
-  void CopyToGpuAsync(const T* data, const size_t size, cudaStream_t stream) {
+  // Perform an asynchronous copy from CPU memory to GPU memory managed by this
+  // CudaBuffer instance using the stream provided.
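+  // Note: the copy is asynchronous, so the caller should keep `data` valid
+  // (and unmodified) until the stream has been synchronized.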
+  void CopyFromCpu(const T* data, const size_t size, cudaStream_t stream) {
     Reserve(size);
     CHECK_EQ(cudaMemcpyAsync(
                  data_, data, size * sizeof(T), cudaMemcpyHostToDevice, stream),
              cudaSuccess);
   }
 
-  // Perform an asynchronous copy from GPU memory using the stream provided.
-  void CopyFromGpuAsync(const T* data, const size_t size, cudaStream_t stream) {
+  // Perform an asynchronous copy from a vector in CPU memory to GPU memory
+  // managed by this CudaBuffer instance.
+  void CopyFromCpuVector(const std::vector<T>& data, cudaStream_t stream) {
+    Reserve(data.size());
+    CHECK_EQ(cudaMemcpyAsync(data_,
+                             data.data(),
+                             data.size() * sizeof(T),
+                             cudaMemcpyHostToDevice,
+                             stream),
+             cudaSuccess);
+  }
+
+  // Perform an asynchronous copy from another GPU memory array to the GPU
+  // memory managed by this CudaBuffer instance using the stream provided.
+  void CopyFromGPUArray(const T* data, const size_t size, cudaStream_t stream) {
     Reserve(size);
     CHECK_EQ(
         cudaMemcpyAsync(
@@ -90,21 +102,38 @@
         cudaSuccess);
   }
 
-  // Copy data from the GPU to CPU memory. This is necessarily synchronous since
-  // any potential GPU kernels that may be writing to the buffer must finish
-  // before the transfer happens.
-  void CopyToHost(T* data, const size_t size) {
+  // Copy data from the GPU memory managed by this CudaBuffer instance to CPU
+  // memory. It is the caller's responsibility to ensure that the CPU memory
+  // pointer is valid, i.e. it is not null, and that it points to memory of
+  // at least this->size() size. This copy is necessarily synchronous since any
+  // potential GPU kernels that may be writing to the buffer must finish before
+  // the transfer happens.
+  void CopyToCpu(T* data, const size_t size) const {
     CHECK(data_ != nullptr);
     CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
              cudaSuccess);
   }
 
-  void CopyToGpu(const std::vector<T>& data) {
-    CopyToGpu(data.data(), data.size());
+  // Copy N items from another GPU memory array to the GPU memory managed by
+  // this CudaBuffer instance, growing this buffer's size if needed. This copy
+  // is asynchronous, and operates on the stream provided.
+  void CopyNItemsFrom(int n, const CudaBuffer<T>& other, cudaStream_t stream) {
+    Reserve(n);
+    CHECK(other.data_ != nullptr);
+    CHECK(data_ != nullptr);
+    CHECK_EQ(cudaMemcpyAsync(data_,
+                             other.data_,
+                             n * sizeof(T),
+                             cudaMemcpyDeviceToDevice,
+                             stream),
+             cudaSuccess);
   }
 
+  // Return a pointer to the GPU memory managed by this CudaBuffer instance.
   T* data() { return data_; }
   const T* data() const { return data_; }
+  // Return the number of items of type T that can fit in the GPU memory
+  // allocated so far by this CudaBuffer instance.
   size_t size() const { return size_; }
 
  private:
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc
new file mode 100644
index 0000000..fbdc4b3
--- /dev/null
+++ b/internal/ceres/cuda_kernels_test.cc
@@ -0,0 +1,179 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <cfloat>
+#include <cmath>
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "ceres/ceres_cuda_kernels.h"
+#include "ceres/cuda_buffer.h"
+#include "ceres/internal/config.h"
+#include "ceres/internal/eigen.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace ceres {
+namespace internal {
+
+#ifndef CERES_NO_CUDA
+
+TEST(CudaFP64ToFP32, SimpleConversions) {
+  std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+  CudaBuffer<double> fp64_gpu;
+  fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
+  CudaBuffer<float> fp32_gpu;
+  fp32_gpu.Reserve(fp64_cpu.size());
+  CudaFP64ToFP32(fp64_gpu.data(),
+                 fp32_gpu.data(),
+                 fp64_cpu.size(),
+                 cudaStreamDefault);
+  std::vector<float> fp32_cpu(fp64_cpu.size());
+  fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size());
+  for (int i = 0; i < fp32_cpu.size(); ++i) {
+    EXPECT_EQ(fp32_cpu[i], static_cast<float>(fp64_cpu[i]));
+  }
+}
+
+TEST(CudaFP64ToFP32, NumericallyExtremeValues) {
+  std::vector<double> fp64_cpu = {
+      DBL_MIN,
+      10.0 * DBL_MIN,
+      DBL_MAX,
+      0.1 * DBL_MAX
+  };
+  // First just make sure that the compiler has represented these values
+  // accurately as fp64.
+  EXPECT_GT(fp64_cpu[0], 0.0);
+  EXPECT_GT(fp64_cpu[1], 0.0);
+  EXPECT_TRUE(std::isfinite(fp64_cpu[2]));
+  EXPECT_TRUE(std::isfinite(fp64_cpu[3]));
+  CudaBuffer<double> fp64_gpu;
+  fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
+  CudaBuffer<float> fp32_gpu;
+  fp32_gpu.Reserve(fp64_cpu.size());
+  CudaFP64ToFP32(fp64_gpu.data(),
+                 fp32_gpu.data(),
+                 fp64_cpu.size(),
+                 cudaStreamDefault);
+  std::vector<float> fp32_cpu(fp64_cpu.size());
+  fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size());
+  EXPECT_EQ(fp32_cpu[0], 0.0f);
+  EXPECT_EQ(fp32_cpu[1], 0.0f);
+  EXPECT_EQ(fp32_cpu[2], std::numeric_limits<float>::infinity());
+  EXPECT_EQ(fp32_cpu[3], std::numeric_limits<float>::infinity());
+}
+
+TEST(CudaFP32ToFP64, SimpleConversions) {
+  std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+  CudaBuffer<float> fp32_gpu;
+  fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault);
+  CudaBuffer<double> fp64_gpu;
+  fp64_gpu.Reserve(fp32_cpu.size());
+  CudaFP32ToFP64(fp32_gpu.data(),
+                 fp64_gpu.data(),
+                 fp32_cpu.size(),
+                 cudaStreamDefault);
+  std::vector<double> fp64_cpu(fp32_cpu.size());
+  fp64_gpu.CopyToCpu(fp64_cpu.data(), fp64_cpu.size());
+  for (int i = 0; i < fp64_cpu.size(); ++i) {
+    EXPECT_EQ(fp64_cpu[i], static_cast<double>(fp32_cpu[i]));
+  }
+}
+
+TEST(CudaSetZeroFP32, NonZeroInput) {
+  std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+  CudaBuffer<float> fp32_gpu;
+  fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault);
+  CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), cudaStreamDefault);
+  std::vector<float> fp32_cpu_zero(fp32_cpu.size());
+  fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size());
+  for (int i = 0; i < fp32_cpu_zero.size(); ++i) {
+    EXPECT_EQ(fp32_cpu_zero[i], 0.0f);
+  }
+}
+
+TEST(CudaSetZeroFP64, NonZeroInput) {
+  std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+  CudaBuffer<double> fp64_gpu;
+  fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
+  CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
+  std::vector<double> fp64_cpu_zero(fp64_cpu.size());
+  fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size());
+  for (int i = 0; i < fp64_cpu_zero.size(); ++i) {
+    EXPECT_EQ(fp64_cpu_zero[i], 0.0);
+  }
+}
+
+TEST(CudaDsxpy, DoubleValues) {
+  std::vector<float> fp32_cpu_a = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+  std::vector<double> fp64_cpu_b =
+      {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+  CudaBuffer<float> fp32_gpu_a;
+  fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a, cudaStreamDefault);
+  CudaBuffer<double> fp64_gpu_b;
+  fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b, cudaStreamDefault);
+  CudaDsxpy(fp64_gpu_b.data(),
+            fp32_gpu_a.data(),
+            fp32_gpu_a.size(),
+            cudaStreamDefault);
+  fp64_gpu_b.CopyToCpu(fp64_cpu_b.data(), fp64_cpu_b.size());
+  for (int i = 0; i < fp64_cpu_b.size(); ++i) {
+    EXPECT_DOUBLE_EQ(fp64_cpu_b[i], 2.0 * fp32_cpu_a[i]);
+  }
+}
+
+TEST(CudaDtDxpy, ComputeFourItems) {
+  std::vector<double> x_cpu = {1, 2, 3, 4};
+  std::vector<double> y_cpu = {4, 3, 2, 1};
+  std::vector<double> d_cpu = {10, 20, 30, 40};
+  CudaBuffer<double> x_gpu;
+  x_gpu.CopyFromCpuVector(x_cpu, cudaStreamDefault);
+  CudaBuffer<double> y_gpu;
+  y_gpu.CopyFromCpuVector(y_cpu, cudaStreamDefault);
+  CudaBuffer<double> d_gpu;
+  d_gpu.CopyFromCpuVector(d_cpu, cudaStreamDefault);
+  CudaDtDxpy(y_gpu.data(),
+             d_gpu.data(),
+             x_gpu.data(),
+             y_gpu.size(),
+             cudaStreamDefault);
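+  // CopyToCpu performs a synchronous cudaMemcpy, so the kernel launched on
+  // the default stream above has completed before y_cpu is read back.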
+  y_gpu.CopyToCpu(y_cpu.data(), y_cpu.size());
+  EXPECT_DOUBLE_EQ(y_cpu[0], 4.0 + 10.0 * 10.0 * 1.0);
+  EXPECT_DOUBLE_EQ(y_cpu[1], 3.0 + 20.0 * 20.0 * 2.0);
+  EXPECT_DOUBLE_EQ(y_cpu[2], 2.0 + 30.0 * 30.0 * 3.0);
+  EXPECT_DOUBLE_EQ(y_cpu[3], 1.0 + 40.0 * 40.0 * 4.0);
+}
+
+#endif  // CERES_NO_CUDA
+
+}  // namespace internal
+}  // namespace ceres
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index 16d3e1a..a6220e4 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -363,7 +363,7 @@
   factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
   lhs_.Reserve(num_cols * num_cols);
   num_cols_ = num_cols;
-  lhs_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_);
+  lhs_.CopyFromCpu(lhs, num_cols * num_cols, stream_);
   int device_workspace_size = 0;
   if (cusolverDnDpotrf_bufferSize(cusolver_handle_,
                                   CUBLAS_FILL_MODE_LOWER,
@@ -393,7 +393,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   int error = 0;
-  error_.CopyToHost(&error, 1);
+  error_.CopyToCpu(&error, 1);
   if (error < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
                << "please report it. "
@@ -422,7 +422,7 @@
     *message = "Factorize did not complete successfully previously.";
     return factorize_result_;
   }
-  rhs_.CopyToGpuAsync(rhs, num_cols_, stream_);
+  rhs_.CopyFromCpu(rhs, num_cols_, stream_);
   if (cusolverDnDpotrs(cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols_,
@@ -441,14 +441,14 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   int error = 0;
-  error_.CopyToHost(&error, 1);
+  error_.CopyToCpu(&error, 1);
   if (error != 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
                << "Please report it."
                << "cuSolverDN::cusolverDnDpotrs fatal error. "
                << "Argument: " << -error << " is invalid.";
   }
-  rhs_.CopyToHost(solution, num_cols_);
+  rhs_.CopyToCpu(solution, num_cols_);
   *message = "Success";
   return LinearSolverTerminationType::SUCCESS;
 }
@@ -533,7 +533,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   int error = 0;
-  error_.CopyToHost(&error, 1);
+  error_.CopyToCpu(&error, 1);
   if (error < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
                << "please report it. "
@@ -581,7 +581,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   int error = 0;
-  error_.CopyToHost(&error, 1);
+  error_.CopyToCpu(&error, 1);
   if (error != 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
                << "Please report it."
@@ -598,7 +598,7 @@
 
   // Copy fp64 version of lhs to GPU.
   lhs_fp64_.Reserve(num_cols * num_cols);
-  lhs_fp64_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_);
+  lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols, stream_);
 
   // Create an fp32 copy of lhs, lhs_fp32.
   lhs_fp32_.Reserve(num_cols * num_cols);
@@ -629,8 +629,8 @@
   CudaSetZeroFP64(x_fp64_.data(), num_cols_, stream_);
 
   // Initialize residual = rhs.
-  rhs_fp64_.CopyToGpuAsync(rhs, num_cols_, stream_);
-  residual_fp64_.CopyFromGpuAsync(rhs_fp64_.data(), num_cols_, stream_);
+  rhs_fp64_.CopyFromCpu(rhs, num_cols_, stream_);
+  residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_);
 
   for (int i = 0; i <= max_num_refinement_iterations_; ++i) {
     // Cast residual from fp64 to fp32.
@@ -647,7 +647,7 @@
       // [fp64] residual = rhs - lhs * x
       // This is done in two steps:
       // 1. [fp64] residual = rhs
-      residual_fp64_.CopyFromGpuAsync(rhs_fp64_.data(), num_cols_, stream_);
+      residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_);
       // 2. [fp64] residual = residual - lhs * x
       double alpha = -1.0;
       double beta = 1.0;
@@ -664,7 +664,7 @@
                   1);
     }
   }
-  x_fp64_.CopyToHost(solution, num_cols_);
+  x_fp64_.CopyToCpu(solution, num_cols_);
   *message = "Success.";
   return LinearSolverTerminationType::SUCCESS;
 }
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc
index cbe7533..775073d 100644
--- a/internal/ceres/dense_qr.cc
+++ b/internal/ceres/dense_qr.cc
@@ -331,7 +331,7 @@
   tau_.Reserve(std::min(num_rows, num_cols));
   num_rows_ = num_rows;
   num_cols_ = num_cols;
-  lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_);
+  lhs_.CopyFromCpu(lhs, num_rows * num_cols, stream_);
   int device_workspace_size = 0;
   if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
                                   num_rows,
@@ -362,7 +362,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   int error = 0;
-  error_.CopyToHost(&error, 1);
+  error_.CopyToCpu(&error, 1);
   if (error < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
                << "please report it. "
@@ -385,7 +385,7 @@
     *message = "Factorize did not complete successfully previously.";
     return factorize_result_;
   }
-  rhs_.CopyToGpuAsync(rhs, num_rows_, stream_);
+  rhs_.CopyFromCpu(rhs, num_rows_, stream_);
   int device_workspace_size = 0;
   if (cusolverDnDormqr_bufferSize(cusolver_handle_,
                                   CUBLAS_SIDE_LEFT,
@@ -424,7 +424,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   int error = 0;
-  error_.CopyToHost(&error, 1);
+  error_.CopyToCpu(&error, 1);
   if (error < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
                << "Please report it."
@@ -450,7 +450,7 @@
     *message = "Cuda device synchronization failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  rhs_.CopyToHost(solution, num_cols_);
+  rhs_.CopyToCpu(solution, num_cols_);
   *message = "Success";
   return LinearSolverTerminationType::SUCCESS;
 }