CUDA Cleanup
* Renamed several CudaBuffer interfaces for clarity and consistency (see the
  usage sketch below).
* Added unit tests for the custom CUDA kernels.
* Set specific CUDA architectures if the CMake version supports it.
Change-Id: I269fb1089b80b25e17bca772ef8d70e7894214b8
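For illustration only (not part of this change): a minimal sketch of a
host-to-device round trip with the renamed CudaBuffer interface, assuming
CudaBuffer lives in ceres::internal as the neighboring headers suggest, and
using the default stream in the same way the new tests below do.

    #include <vector>
    #include "ceres/cuda_buffer.h"

    // Hypothetical helper, not part of this patch.
    void RoundTrip(const std::vector<double>& host_in,
                   std::vector<double>* host_out) {
      ceres::internal::CudaBuffer<double> buffer;
      // Asynchronous host-to-device copy on the given stream.
      buffer.CopyFromCpuVector(host_in, cudaStreamDefault);
      host_out->resize(host_in.size());
      // Synchronous device-to-host copy; pending writes complete first.
      buffer.CopyToCpu(host_out->data(), host_out->size());
    }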
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c2e1e4e..b1fcddd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -237,6 +237,11 @@
"${CUDA_LIBRARIES};"
"${CUDA_cusolver_LIBRARY};"
"${CUDA_cusparse_LIBRARY}")
+ if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")
+ # Support Maxwell, Pascal, Volta, and Ampere GPUs.
+ set(CMAKE_CUDA_ARCHITECTURES "50;60;70;80")
+ message("-- Setting CUDA Architecture to ${CMAKE_CUDA_ARCHITECTURES}")
+ endif()
enable_language(CUDA)
else (CUDA_FOUND)
message("-- Did not find CUDA library, disabling CUDA support.")
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 75df3c6..ea21df2 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -482,6 +482,7 @@
ceres_test(cubic_interpolation)
ceres_test(cuda_dense_cholesky)
ceres_test(cuda_dense_qr)
+ ceres_test(cuda_kernels)
ceres_test(dense_linear_solver)
ceres_test(dense_cholesky)
ceres_test(dense_qr)
diff --git a/internal/ceres/ceres_cuda_kernels.cu b/internal/ceres/ceres_cuda_kernels.cu
index 2fd76b5..cd045e3 100644
--- a/internal/ceres/ceres_cuda_kernels.cu
+++ b/internal/ceres/ceres_cuda_kernels.cu
@@ -106,4 +106,24 @@
<<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
}
+__global__ void CudaDtDxpyKernel(double* __restrict__ y,
+ const double* D,
+ const double* __restrict__ x,
+ const int size) {
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < size) {
+ y[i] = y[i] + D[i] * D[i] * x[i];
+ }
+}
+
+void CudaDtDxpy(double* y,
+ const double* D,
+ const double* x,
+ const int size,
+ cudaStream_t stream) {
+ const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(
+ y, D, x, size);
+}
+
} // namespace ceres::internal
\ No newline at end of file
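For reference, the launch configuration added above follows the ceiling-division
pattern used by the existing kernels in this file: the grid is sized so that
num_blocks * kCudaBlockSize >= size, and the bounds check inside the kernel
discards the surplus threads of the last block. A small standalone sketch of the
arithmetic (the block size value is a stand-in, not necessarily what the file
defines):

    // Hypothetical illustration of the grid-size computation.
    constexpr int kBlockSize = 256;  // stand-in for kCudaBlockSize

    constexpr int NumBlocks(int size) {
      return (size + kBlockSize - 1) / kBlockSize;
    }

    static_assert(NumBlocks(1) == 1, "a single partial block");
    static_assert(NumBlocks(256) == 1, "exactly one full block");
    static_assert(NumBlocks(257) == 2, "one full block plus one partial block");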
diff --git a/internal/ceres/ceres_cuda_kernels.h b/internal/ceres/ceres_cuda_kernels.h
index f033147..91675e8 100644
--- a/internal/ceres/ceres_cuda_kernels.h
+++ b/internal/ceres/ceres_cuda_kernels.h
@@ -62,6 +62,13 @@
// double (FP64). Both arrays must already be on GPU memory.
void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream);
+// Compute y[i] = y[i] + D[i]^2 x[i]. All arrays must already be on GPU memory.
+void CudaDtDxpy(double* y,
+ const double* D,
+ const double* x,
+ const int size,
+ cudaStream_t stream);
+
} // namespace ceres::internal
#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index 64774fa..89828f9 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -45,8 +45,7 @@
// the appropriate GPU device is selected before each subroutine is called. This
// is particularly important when using multiple GPU devices on different CPU
// threads, since active Cuda devices are determined by the cuda runtime on a
-// per-thread basis. Note that unless otherwise specified, all methods use the
-// default stream, and are synchronous.
+// per-thread basis.
template <typename T>
class CudaBuffer {
public:
@@ -72,17 +71,30 @@
}
}
- // Perform an asynchronous copy from CPU memory to GPU memory using the stream
- // provided.
- void CopyToGpuAsync(const T* data, const size_t size, cudaStream_t stream) {
+ // Perform an asynchronous copy from CPU memory to GPU memory managed by this
+ // CudaBuffer instance using the stream provided.
+ void CopyFromCpu(const T* data, const size_t size, cudaStream_t stream) {
Reserve(size);
CHECK_EQ(cudaMemcpyAsync(
data_, data, size * sizeof(T), cudaMemcpyHostToDevice, stream),
cudaSuccess);
}
- // Perform an asynchronous copy from GPU memory using the stream provided.
- void CopyFromGpuAsync(const T* data, const size_t size, cudaStream_t stream) {
+ // Perform an asynchronous copy from a vector in CPU memory to GPU memory
+ // managed by this CudaBuffer instance.
+ void CopyFromCpuVector(const std::vector<T>& data, cudaStream_t stream) {
+ Reserve(data.size());
+ CHECK_EQ(cudaMemcpyAsync(data_,
+ data.data(),
+ data.size() * sizeof(T),
+ cudaMemcpyHostToDevice,
+ stream),
+ cudaSuccess);
+ }
+
+ // Perform an asynchronous copy from another GPU memory array to the GPU
+ // memory managed by this CudaBuffer instance using the stream provided.
+ void CopyFromGPUArray(const T* data, const size_t size, cudaStream_t stream) {
Reserve(size);
CHECK_EQ(
cudaMemcpyAsync(
@@ -90,21 +102,38 @@
cudaSuccess);
}
- // Copy data from the GPU to CPU memory. This is necessarily synchronous since
- // any potential GPU kernels that may be writing to the buffer must finish
- // before the transfer happens.
- void CopyToHost(T* data, const size_t size) {
+ // Copy data from the GPU memory managed by this CudaBuffer instance to CPU
+ // memory. It is the caller's responsibility to ensure that the CPU memory
+ // pointer is valid, i.e. it is not null, and that it points to memory of
+ // at least this->size() size. This copy is necessarily synchronous since any
+ // potential GPU kernels that may be writing to the buffer must finish before
+ // the transfer happens.
+ void CopyToCpu(T* data, const size_t size) const {
CHECK(data_ != nullptr);
CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
cudaSuccess);
}
- void CopyToGpu(const std::vector<T>& data) {
- CopyToGpu(data.data(), data.size());
+ // Copy N items from another GPU memory array to the GPU memory managed by
+ // this CudaBuffer instance, growing this buffer's size if needed. This copy
+ // is asynchronous, and operates on the stream provided.
+ void CopyNItemsFrom(int n, const CudaBuffer<T>& other, cudaStream_t stream) {
+ Reserve(n);
+ CHECK(other.data_ != nullptr);
+ CHECK(data_ != nullptr);
+ CHECK_EQ(cudaMemcpyAsync(data_,
+ other.data_,
+ n * sizeof(T),
+ cudaMemcpyDeviceToDevice,
+ stream),
+ cudaSuccess);
}
+ // Return a pointer to the GPU memory managed by this CudaBuffer instance.
T* data() { return data_; }
const T* data() const { return data_; }
+ // Return the number of items of type T that can fit in the GPU memory
+ // allocated so far by this CudaBuffer instance.
size_t size() const { return size_; }
private:
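Note on CopyNItemsFrom: Reserve() only grows the allocation, so after Reserve(n)
the destination may hold more than n items; the copy therefore transfers exactly
n items rather than size_, which could otherwise read past the end of the source
buffer. A hedged sketch of the intended call pattern (names are illustrative):

    // Hypothetical: copy the first n entries of src into dst on a stream.
    void CopyPrefix(const ceres::internal::CudaBuffer<double>& src,
                    int n,
                    ceres::internal::CudaBuffer<double>* dst,
                    cudaStream_t stream) {
      // dst is grown to at least n items; a larger existing allocation is kept.
      dst->CopyNItemsFrom(n, src, stream);
    }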
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc
new file mode 100644
index 0000000..fbdc4b3
--- /dev/null
+++ b/internal/ceres/cuda_kernels_test.cc
@@ -0,0 +1,179 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <cfloat>
+#include <cmath>
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "ceres/ceres_cuda_kernels.h"
+#include "ceres/cuda_buffer.h"
+#include "ceres/internal/config.h"
+#include "ceres/internal/eigen.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace ceres {
+namespace internal {
+
+#ifndef CERES_NO_CUDA
+
+TEST(CudaFP64ToFP32, SimpleConversions) {
+ std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+ CudaBuffer<double> fp64_gpu;
+ fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
+ CudaBuffer<float> fp32_gpu;
+ fp32_gpu.Reserve(fp64_cpu.size());
+ CudaFP64ToFP32(fp64_gpu.data(),
+ fp32_gpu.data(),
+ fp64_cpu.size(),
+ cudaStreamDefault);
+ std::vector<float> fp32_cpu(fp64_cpu.size());
+ fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size());
+ for (int i = 0; i < fp32_cpu.size(); ++i) {
+ EXPECT_EQ(fp32_cpu[i], static_cast<float>(fp64_cpu[i]));
+ }
+}
+
+TEST(CudaFP64ToFP32, NumericallyExtremeValues) {
+ std::vector<double> fp64_cpu = {
+ DBL_MIN,
+ 10.0 * DBL_MIN,
+ DBL_MAX,
+ 0.1 * DBL_MAX
+ };
+ // First just make sure that the compiler has represented these values
+ // accurately as fp64.
+ EXPECT_GT(fp64_cpu[0], 0.0);
+ EXPECT_GT(fp64_cpu[1], 0.0);
+ EXPECT_TRUE(std::isfinite(fp64_cpu[2]));
+ EXPECT_TRUE(std::isfinite(fp64_cpu[3]));
+ CudaBuffer<double> fp64_gpu;
+ fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
+ CudaBuffer<float> fp32_gpu;
+ fp32_gpu.Reserve(fp64_cpu.size());
+ CudaFP64ToFP32(fp64_gpu.data(),
+ fp32_gpu.data(),
+ fp64_cpu.size(),
+ cudaStreamDefault);
+ std::vector<float> fp32_cpu(fp64_cpu.size());
+ fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size());
+ EXPECT_EQ(fp32_cpu[0], 0.0f);
+ EXPECT_EQ(fp32_cpu[1], 0.0f);
+ EXPECT_EQ(fp32_cpu[2], std::numeric_limits<float>::infinity());
+ EXPECT_EQ(fp32_cpu[3], std::numeric_limits<float>::infinity());
+}
+
+TEST(CudaFP32ToFP64, SimpleConversions) {
+ std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+ CudaBuffer<float> fp32_gpu;
+ fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault);
+ CudaBuffer<double> fp64_gpu;
+ fp64_gpu.Reserve(fp32_cpu.size());
+ CudaFP32ToFP64(fp32_gpu.data(),
+ fp64_gpu.data(),
+ fp32_cpu.size(),
+ cudaStreamDefault);
+ std::vector<double> fp64_cpu(fp32_cpu.size());
+ fp64_gpu.CopyToCpu(fp64_cpu.data(), fp64_cpu.size());
+ for (int i = 0; i < fp64_cpu.size(); ++i) {
+ EXPECT_EQ(fp64_cpu[i], static_cast<double>(fp32_cpu[i]));
+ }
+}
+
+TEST(CudaSetZeroFP32, NonZeroInput) {
+ std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+ CudaBuffer<float> fp32_gpu;
+ fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault);
+ CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), cudaStreamDefault);
+ std::vector<float> fp32_cpu_zero(fp32_cpu.size());
+ fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size());
+ for (int i = 0; i < fp32_cpu_zero.size(); ++i) {
+ EXPECT_EQ(fp32_cpu_zero[i], 0.0f);
+ }
+}
+
+TEST(CudaSetZeroFP64, NonZeroInput) {
+ std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+ CudaBuffer<double> fp64_gpu;
+ fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
+ CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
+ std::vector<double> fp64_cpu_zero(fp64_cpu.size());
+ fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size());
+ for (int i = 0; i < fp64_cpu_zero.size(); ++i) {
+ EXPECT_EQ(fp64_cpu_zero[i], 0.0);
+ }
+}
+
+TEST(CudaDsxpy, DoubleValues) {
+ std::vector<float> fp32_cpu_a = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+ std::vector<double> fp64_cpu_b =
+ {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
+ CudaBuffer<float> fp32_gpu_a;
+ fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a, cudaStreamDefault);
+ CudaBuffer<double> fp64_gpu_b;
+ fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b, cudaStreamDefault);
+ CudaDsxpy(fp64_gpu_b.data(),
+ fp32_gpu_a.data(),
+ fp32_gpu_a.size(),
+ cudaStreamDefault);
+ fp64_gpu_b.CopyToCpu(fp64_cpu_b.data(), fp64_cpu_b.size());
+ for (int i = 0; i < fp64_cpu_b.size(); ++i) {
+ EXPECT_DOUBLE_EQ(fp64_cpu_b[i], 2.0 * fp32_cpu_a[i]);
+ }
+}
+
+TEST(CudaDtDxpy, ComputeFourItems) {
+ std::vector<double> x_cpu = {1, 2, 3, 4};
+ std::vector<double> y_cpu = {4, 3, 2, 1};
+ std::vector<double> d_cpu = {10, 20, 30, 40};
+ CudaBuffer<double> x_gpu;
+ x_gpu.CopyFromCpuVector(x_cpu, cudaStreamDefault);
+ CudaBuffer<double> y_gpu;
+ y_gpu.CopyFromCpuVector(y_cpu, cudaStreamDefault);
+ CudaBuffer<double> d_gpu;
+ d_gpu.CopyFromCpuVector(d_cpu, cudaStreamDefault);
+ CudaDtDxpy(y_gpu.data(),
+ d_gpu.data(),
+ x_gpu.data(),
+ y_gpu.size(),
+ cudaStreamDefault);
+ y_gpu.CopyToCpu(y_cpu.data(), y_cpu.size());
+ EXPECT_DOUBLE_EQ(y_cpu[0], 4.0 + 10.0 * 10.0 * 1.0);
+ EXPECT_DOUBLE_EQ(y_cpu[1], 3.0 + 20.0 * 20.0 * 2.0);
+ EXPECT_DOUBLE_EQ(y_cpu[2], 2.0 + 30.0 * 30.0 * 3.0);
+ EXPECT_DOUBLE_EQ(y_cpu[3], 1.0 + 40.0 * 40.0 * 4.0);
+}
+
+#endif // CERES_NO_CUDA
+
+} // namespace internal
+} // namespace ceres
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index 16d3e1a..a6220e4 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -363,7 +363,7 @@
factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
lhs_.Reserve(num_cols * num_cols);
num_cols_ = num_cols;
- lhs_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_);
+ lhs_.CopyFromCpu(lhs, num_cols * num_cols, stream_);
int device_workspace_size = 0;
if (cusolverDnDpotrf_bufferSize(cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
@@ -393,7 +393,7 @@
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
- error_.CopyToHost(&error, 1);
+ error_.CopyToCpu(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
<< "please report it. "
@@ -422,7 +422,7 @@
*message = "Factorize did not complete successfully previously.";
return factorize_result_;
}
- rhs_.CopyToGpuAsync(rhs, num_cols_, stream_);
+ rhs_.CopyFromCpu(rhs, num_cols_, stream_);
if (cusolverDnDpotrs(cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
@@ -441,14 +441,14 @@
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
- error_.CopyToHost(&error, 1);
+ error_.CopyToCpu(&error, 1);
if (error != 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it."
<< "cuSolverDN::cusolverDnDpotrs fatal error. "
<< "Argument: " << -error << " is invalid.";
}
- rhs_.CopyToHost(solution, num_cols_);
+ rhs_.CopyToCpu(solution, num_cols_);
*message = "Success";
return LinearSolverTerminationType::SUCCESS;
}
@@ -533,7 +533,7 @@
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
- error_.CopyToHost(&error, 1);
+ error_.CopyToCpu(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
<< "please report it. "
@@ -581,7 +581,7 @@
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
- error_.CopyToHost(&error, 1);
+ error_.CopyToCpu(&error, 1);
if (error != 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it."
@@ -598,7 +598,7 @@
// Copy fp64 version of lhs to GPU.
lhs_fp64_.Reserve(num_cols * num_cols);
- lhs_fp64_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_);
+ lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols, stream_);
// Create an fp32 copy of lhs, lhs_fp32.
lhs_fp32_.Reserve(num_cols * num_cols);
@@ -629,8 +629,8 @@
CudaSetZeroFP64(x_fp64_.data(), num_cols_, stream_);
// Initialize residual = rhs.
- rhs_fp64_.CopyToGpuAsync(rhs, num_cols_, stream_);
- residual_fp64_.CopyFromGpuAsync(rhs_fp64_.data(), num_cols_, stream_);
+ rhs_fp64_.CopyFromCpu(rhs, num_cols_, stream_);
+ residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_);
for (int i = 0; i <= max_num_refinement_iterations_; ++i) {
// Cast residual from fp64 to fp32.
@@ -647,7 +647,7 @@
// [fp64] residual = rhs - lhs * x
// This is done in two steps:
// 1. [fp64] residual = rhs
- residual_fp64_.CopyFromGpuAsync(rhs_fp64_.data(), num_cols_, stream_);
+ residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_);
// 2. [fp64] residual = residual - lhs * x
double alpha = -1.0;
double beta = 1.0;
@@ -664,7 +664,7 @@
1);
}
}
- x_fp64_.CopyToHost(solution, num_cols_);
+ x_fp64_.CopyToCpu(solution, num_cols_);
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
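The hunks above touch the mixed-precision Cholesky path. As far as is visible in
this diff, its shape is: keep fp64 copies of lhs and rhs on the GPU, factorize an
fp32 copy of lhs, and then repeatedly solve for a correction and recompute the
fp64 residual rhs - lhs * x. A rough CPU-side sketch of that loop using Eigen,
purely to illustrate the algebra (the real path runs on the GPU through
cuSOLVER/cuBLAS and is only partially shown here):

    #include "Eigen/Dense"

    // Hypothetical CPU illustration of mixed-precision iterative refinement.
    Eigen::VectorXd RefinedSolve(const Eigen::MatrixXd& lhs,
                                 const Eigen::VectorXd& rhs,
                                 int max_refinement_iterations) {
      // Factorize a low-precision copy of the lhs.
      const Eigen::MatrixXf lhs_fp32 = lhs.cast<float>();
      const Eigen::LLT<Eigen::MatrixXf> llt(lhs_fp32);
      Eigen::VectorXd x = Eigen::VectorXd::Zero(rhs.size());
      Eigen::VectorXd residual = rhs;  // Initialize residual = rhs.
      for (int i = 0; i <= max_refinement_iterations; ++i) {
        // Solve lhs * correction = residual in fp32, accumulate in fp64.
        const Eigen::VectorXf correction = llt.solve(residual.cast<float>());
        x += correction.cast<double>();
        // Recompute the fp64 residual: residual = rhs - lhs * x.
        residual = rhs - lhs * x;
      }
      return x;
    }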
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc
index cbe7533..775073d 100644
--- a/internal/ceres/dense_qr.cc
+++ b/internal/ceres/dense_qr.cc
@@ -331,7 +331,7 @@
tau_.Reserve(std::min(num_rows, num_cols));
num_rows_ = num_rows;
num_cols_ = num_cols;
- lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_);
+ lhs_.CopyFromCpu(lhs, num_rows * num_cols, stream_);
int device_workspace_size = 0;
if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
num_rows,
@@ -362,7 +362,7 @@
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
- error_.CopyToHost(&error, 1);
+ error_.CopyToCpu(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
<< "please report it. "
@@ -385,7 +385,7 @@
*message = "Factorize did not complete successfully previously.";
return factorize_result_;
}
- rhs_.CopyToGpuAsync(rhs, num_rows_, stream_);
+ rhs_.CopyFromCpu(rhs, num_rows_, stream_);
int device_workspace_size = 0;
if (cusolverDnDormqr_bufferSize(cusolver_handle_,
CUBLAS_SIDE_LEFT,
@@ -424,7 +424,7 @@
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
- error_.CopyToHost(&error, 1);
+ error_.CopyToCpu(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it."
@@ -450,7 +450,7 @@
*message = "Cuda device synchronization failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
- rhs_.CopyToHost(solution, num_cols_);
+ rhs_.CopyToCpu(solution, num_cols_);
*message = "Success";
return LinearSolverTerminationType::SUCCESS;
}