CUDA Cleanup
* All Cuda* objects now take a ContextImpl* during construction
and store the context instead of individual handles.
* Since we no longer use the legacy default stream, the stream
must be explicitly synchronized before GPU to CPU transfers;
CudaBuffer now performs this synchronization whenever it is asked
to do such a transfer.
* Remove all manual syncs; synchronization before GPU to CPU
transfers is now delegated to CudaBuffer.
Change-Id: Ic73cb24174a1e09842827323280e90241716cc20
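
A rough sketch of the resulting usage pattern (all names are taken from
the diff below; this fragment assumes the Ceres internal headers and is
not part of the patch):

    // Construct a context and initialize CUDA once.
    ContextImpl context;
    std::string error;
    CHECK(context.InitCuda(&error)) << error;

    std::vector<double> cpu = {1.0, 2.0, 3.0};
    // The buffer is bound to the context at construction.
    CudaBuffer<double> gpu(&context);
    // Asynchronous host-to-device copy on context.stream_.
    gpu.CopyFromCpuVector(cpu);
    // Kernels are launched on the same stream, so they are ordered
    // after the copy.
    CudaSetZeroFP64(gpu.data(), gpu.size(), context.stream_);
    // CopyToCpu synchronizes the stream before the host reads the
    // data; no manual cudaStreamSynchronize is needed.
    gpu.CopyToCpu(cpu.data(), cpu.size());
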
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h
index ee0c589..d8f294a 100644
--- a/internal/ceres/context_impl.h
+++ b/internal/ceres/context_impl.h
@@ -81,6 +81,13 @@
// 3. If the user explicitly selects a GPU in the host process before calling
// Ceres, Ceres will use that GPU.
+ // Note on Ceres' use of CUDA streams:
+ // All operations on the GPU are performed on a single stream. This keeps
+ // the operations stream-ordered, so there is no need to explicitly
+ // synchronize the stream at the end of every operation. Stream
+ // synchronization occurs only before GPU to CPU transfers, and is
+ // handled by CudaBuffer.
+
// Initializes cuBLAS, cuSOLVER, and cuSPARSE contexts, creates an
// asynchronous CUDA stream, and associates the stream with the contexts.
// Returns true iff initialization was successful, else it returns false and a
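
The note above leans on a standard CUDA guarantee: work enqueued on a
single stream executes in FIFO order, so only the final host-visible read
needs an explicit synchronization. A self-contained sketch with the plain
runtime API (independent of Ceres; the Scale kernel is made up for
illustration):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void Scale(double* x, int n, double s) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) x[i] *= s;
    }

    int main() {
      cudaStream_t stream;
      cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
      const int n = 4;
      double host[n] = {1.0, 2.0, 3.0, 4.0};
      double* dev = nullptr;
      cudaMalloc(&dev, n * sizeof(double));
      // These three operations need no intermediate syncs: the stream
      // orders them.
      cudaMemcpyAsync(dev, host, n * sizeof(double),
                      cudaMemcpyHostToDevice, stream);
      Scale<<<1, 32, 0, stream>>>(dev, n, 10.0);
      cudaMemcpyAsync(host, dev, n * sizeof(double),
                      cudaMemcpyDeviceToHost, stream);
      // One sync before the host reads the result.
      cudaStreamSynchronize(stream);
      printf("host[0] = %f\n", host[0]);  // 10.0
      cudaFree(dev);
      cudaStreamDestroy(stream);
      return 0;
    }
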
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index dba1706..97d126c 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -31,6 +31,7 @@
#ifndef CERES_INTERNAL_CUDA_BUFFER_H_
#define CERES_INTERNAL_CUDA_BUFFER_H_
+#include "ceres/context_impl.h"
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
@@ -40,6 +41,7 @@
#include "cuda_runtime.h"
#include "glog/logging.h"
+namespace ceres::internal {
// An encapsulated buffer to maintain GPU memory, and handle transfers between
// GPU and system memory. It is the responsibility of the user to ensure that
// the appropriate GPU device is selected before each subroutine is called. This
@@ -49,7 +51,10 @@
template <typename T>
class CudaBuffer {
public:
- CudaBuffer() = default;
+ explicit CudaBuffer(ContextImpl* context) : context_(context) {}
+ CudaBuffer(ContextImpl* context, int size) : context_(context) {
+ Reserve(size);
+ }
CudaBuffer(const CudaBuffer&) = delete;
CudaBuffer& operator=(const CudaBuffer&) = delete;
@@ -75,51 +80,61 @@
// Perform an asynchronous copy from CPU memory to GPU memory managed by this
// CudaBuffer instance using the context's stream.
- void CopyFromCpu(const T* data, const size_t size, cudaStream_t stream) {
+ void CopyFromCpu(const T* data, const size_t size) {
Reserve(size);
- CHECK_EQ(cudaMemcpyAsync(
- data_, data, size * sizeof(T), cudaMemcpyHostToDevice, stream),
+ CHECK_EQ(cudaMemcpyAsync(data_,
+ data,
+ size * sizeof(T),
+ cudaMemcpyHostToDevice,
+ context_->stream_),
cudaSuccess);
}
// Perform an asynchronous copy from a vector in CPU memory to GPU memory
// managed by this CudaBuffer instance.
- void CopyFromCpuVector(const std::vector<T>& data, cudaStream_t stream) {
+ void CopyFromCpuVector(const std::vector<T>& data) {
Reserve(data.size());
CHECK_EQ(cudaMemcpyAsync(data_,
data.data(),
data.size() * sizeof(T),
cudaMemcpyHostToDevice,
- stream),
+ context_->stream_),
cudaSuccess);
}
// Perform an asynchronous copy from another GPU memory array to the GPU
// memory managed by this CudaBuffer instance using the context's stream.
- void CopyFromGPUArray(const T* data, const size_t size, cudaStream_t stream) {
+ void CopyFromGPUArray(const T* data, const size_t size) {
Reserve(size);
- CHECK_EQ(
- cudaMemcpyAsync(
- data_, data, size * sizeof(T), cudaMemcpyDeviceToDevice, stream),
- cudaSuccess);
+ CHECK_EQ(cudaMemcpyAsync(data_,
+ data,
+ size * sizeof(T),
+ cudaMemcpyDeviceToDevice,
+ context_->stream_),
+ cudaSuccess);
}
// Copy data from the GPU memory managed by this CudaBuffer instance to CPU
// memory. It is the caller's responsibility to ensure that the CPU memory
// pointer is valid, i.e. it is not null, and that it points to memory of
- // at least this->size() size. This copy is necessarily synchronous since any
- // potential GPU kernels that may be writing to the buffer must finish before
- // the transfer happens.
+ // at least this->size() size. This method ensures all previously dispatched
+ // GPU operations on the context's stream have completed before copying the
+ // data to CPU memory.
void CopyToCpu(T* data, const size_t size) const {
CHECK(data_ != nullptr);
- CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
+ CHECK_EQ(cudaMemcpyAsync(data,
+ data_,
+ size * sizeof(T),
+ cudaMemcpyDeviceToHost,
+ context_->stream_),
cudaSuccess);
+ CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
}
// Copy N items from another GPU memory array to the GPU memory managed by
// this CudaBuffer instance, growing this buffer's size if needed. This copy
// is asynchronous, and operates on the context's stream.
- void CopyNItemsFrom(int n, const CudaBuffer<T>& other, cudaStream_t stream) {
+ void CopyNItemsFrom(int n, const CudaBuffer<T>& other) {
Reserve(n);
CHECK(other.data_ != nullptr);
CHECK(data_ != nullptr);
@@ -127,7 +142,7 @@
other.data_,
size_ * sizeof(T),
cudaMemcpyDeviceToDevice,
- stream),
+ context_->stream_),
cudaSuccess);
}
@@ -141,7 +156,9 @@
private:
T* data_ = nullptr;
size_t size_ = 0;
+ ContextImpl* context_ = nullptr;
};
+} // namespace ceres::internal
#endif // CERES_NO_CUDA
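
A quick sketch of the two constructors and the copy helpers above
(context is an initialized ContextImpl as in the tests below; host_ptr is
a hypothetical pointer to at least 128 doubles):

    CudaBuffer<double> a(&context);       // empty; storage allocated lazily
    CudaBuffer<double> b(&context, 128);  // reserves 128 doubles up front
    b.CopyFromCpu(host_ptr, 128);         // async H->D on the context's stream
    a.CopyNItemsFrom(64, b);              // async D->D; grows a to 64 items
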
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc
index 053b442..9290aa5 100644
--- a/internal/ceres/cuda_kernels_test.cc
+++ b/internal/ceres/cuda_kernels_test.cc
@@ -36,6 +36,7 @@
#include <string>
#include <vector>
+#include "ceres/context_impl.h"
#include "ceres/cuda_buffer.h"
#include "ceres/internal/config.h"
#include "ceres/internal/eigen.h"
@@ -48,10 +49,13 @@
#ifndef CERES_NO_CUDA
TEST(CudaFP64ToFP32, SimpleConversions) {
+ ContextImpl context;
+ std::string cuda_error;
+ EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
- CudaBuffer<double> fp64_gpu;
- fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
- CudaBuffer<float> fp32_gpu;
+ CudaBuffer<double> fp64_gpu(&context);
+ fp64_gpu.CopyFromCpuVector(fp64_cpu);
+ CudaBuffer<float> fp32_gpu(&context);
fp32_gpu.Reserve(fp64_cpu.size());
CudaFP64ToFP32(
fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
@@ -63,6 +67,9 @@
}
TEST(CudaFP64ToFP32, NumericallyExtremeValues) {
+ ContextImpl context;
+ std::string cuda_error;
+ EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<double> fp64_cpu = {
DBL_MIN, 10.0 * DBL_MIN, DBL_MAX, 0.1 * DBL_MAX};
// First just make sure that the compiler has represented these values
@@ -71,9 +78,9 @@
EXPECT_GT(fp64_cpu[1], 0.0);
EXPECT_TRUE(std::isfinite(fp64_cpu[2]));
EXPECT_TRUE(std::isfinite(fp64_cpu[3]));
- CudaBuffer<double> fp64_gpu;
- fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
- CudaBuffer<float> fp32_gpu;
+ CudaBuffer<double> fp64_gpu(&context);
+ fp64_gpu.CopyFromCpuVector(fp64_cpu);
+ CudaBuffer<float> fp32_gpu(&context);
fp32_gpu.Reserve(fp64_cpu.size());
CudaFP64ToFP32(
fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
@@ -86,10 +93,13 @@
}
TEST(CudaFP32ToFP64, SimpleConversions) {
+ ContextImpl context;
+ std::string cuda_error;
+ EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
- CudaBuffer<float> fp32_gpu;
- fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault);
- CudaBuffer<double> fp64_gpu;
+ CudaBuffer<float> fp32_gpu(&context);
+ fp32_gpu.CopyFromCpuVector(fp32_cpu);
+ CudaBuffer<double> fp64_gpu(&context);
fp64_gpu.Reserve(fp32_cpu.size());
CudaFP32ToFP64(
fp32_gpu.data(), fp64_gpu.data(), fp32_cpu.size(), cudaStreamDefault);
@@ -101,9 +111,12 @@
}
TEST(CudaSetZeroFP32, NonZeroInput) {
+ ContextImpl context;
+ std::string cuda_error;
+ EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
- CudaBuffer<float> fp32_gpu;
- fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault);
+ CudaBuffer<float> fp32_gpu(&context);
+ fp32_gpu.CopyFromCpuVector(fp32_cpu);
CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), cudaStreamDefault);
std::vector<float> fp32_cpu_zero(fp32_cpu.size());
fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size());
@@ -113,9 +126,12 @@
}
TEST(CudaSetZeroFP64, NonZeroInput) {
+ ContextImpl context;
+ std::string cuda_error;
+ EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
- CudaBuffer<double> fp64_gpu;
- fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
+ CudaBuffer<double> fp64_gpu(&context);
+ fp64_gpu.CopyFromCpuVector(fp64_cpu);
CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
std::vector<double> fp64_cpu_zero(fp64_cpu.size());
fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size());
@@ -125,13 +141,16 @@
}
TEST(CudaDsxpy, DoubleValues) {
+ ContextImpl context;
+ std::string cuda_error;
+ EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<float> fp32_cpu_a = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
std::vector<double> fp64_cpu_b = {
1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
- CudaBuffer<float> fp32_gpu_a;
- fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a, cudaStreamDefault);
- CudaBuffer<double> fp64_gpu_b;
- fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b, cudaStreamDefault);
+ CudaBuffer<float> fp32_gpu_a(&context);
+ fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a);
+ CudaBuffer<double> fp64_gpu_b(&context);
+ fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b);
CudaDsxpy(fp64_gpu_b.data(),
fp32_gpu_a.data(),
fp32_gpu_a.size(),
@@ -143,15 +162,18 @@
}
TEST(CudaDtDxpy, ComputeFourItems) {
+ ContextImpl context;
+ std::string cuda_error;
+ EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
std::vector<double> x_cpu = {1, 2, 3, 4};
std::vector<double> y_cpu = {4, 3, 2, 1};
std::vector<double> d_cpu = {10, 20, 30, 40};
- CudaBuffer<double> x_gpu;
- x_gpu.CopyFromCpuVector(x_cpu, cudaStreamDefault);
- CudaBuffer<double> y_gpu;
- y_gpu.CopyFromCpuVector(y_cpu, cudaStreamDefault);
- CudaBuffer<double> d_gpu;
- d_gpu.CopyFromCpuVector(d_cpu, cudaStreamDefault);
+ CudaBuffer<double> x_gpu(&context);
+ x_gpu.CopyFromCpuVector(x_cpu);
+ CudaBuffer<double> y_gpu(&context);
+ y_gpu.CopyFromCpuVector(y_cpu);
+ CudaBuffer<double> d_gpu(&context);
+ d_gpu.CopyFromCpuVector(d_cpu);
CudaDtDxpy(y_gpu.data(),
d_gpu.data(),
x_gpu.data(),
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
index 1e361d2..7ee1761 100644
--- a/internal/ceres/cuda_sparse_matrix.cc
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -59,17 +59,21 @@
namespace ceres::internal {
-CudaSparseMatrix::CudaSparseMatrix(
- ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) {
+CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
+ const CompressedRowSparseMatrix& crs_matrix)
+ : context_(context),
+ rows_{context},
+ cols_{context},
+ values_{context},
+ spmv_buffer_{context} {
DCHECK_NE(context, nullptr);
CHECK(context->IsCudaInitialized());
- context_ = context;
num_rows_ = crs_matrix.num_rows();
num_cols_ = crs_matrix.num_cols();
num_nonzeros_ = crs_matrix.num_nonzeros();
- rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1, context_->stream_);
- cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_, context_->stream_);
- values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_);
+ rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1);
+ cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_);
+ values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
cusparseCreateCsr(&descr_,
num_rows_,
num_cols_,
@@ -96,7 +100,7 @@
CHECK_EQ(num_rows_, crs_matrix.num_rows());
CHECK_EQ(num_cols_, crs_matrix.num_cols());
CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
- values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_);
+ values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
}
void CudaSparseMatrix::SpMv(cusparseOperation_t op,
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h
index 62f6b77..d5f3332 100644
--- a/internal/ceres/cuda_sparse_matrix.h
+++ b/internal/ceres/cuda_sparse_matrix.h
@@ -97,6 +97,7 @@
int num_cols_ = 0;
int num_nonzeros_ = 0;
+ ContextImpl* context_ = nullptr;
// CSR row indices.
CudaBuffer<int32_t> rows_;
// CSR column indices.
@@ -104,8 +105,6 @@
// CSR values.
CudaBuffer<double> values_;
- ContextImpl* context_ = nullptr;
-
// CuSparse object that describes this matrix.
cusparseSpMatDescr_t descr_ = nullptr;
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
index 46e6cb2..274a47a 100644
--- a/internal/ceres/cuda_vector.cc
+++ b/internal/ceres/cuda_vector.cc
@@ -50,17 +50,17 @@
namespace ceres::internal {
-CudaVector::CudaVector(ContextImpl* context, int size) {
+CudaVector::CudaVector(ContextImpl* context, int size)
+ : context_(context), data_(context, size) {
DCHECK_NE(context, nullptr);
- CHECK(context->IsCudaInitialized());
- context_ = context;
+ DCHECK(context->IsCudaInitialized());
Resize(size);
}
CudaVector& CudaVector::operator=(const CudaVector& other) {
if (this != &other) {
Resize(other.num_rows());
- data_.CopyFromGPUArray(other.data_.data(), num_rows_, context_->stream_);
+ data_.CopyFromGPUArray(other.data_.data(), num_rows_);
}
return *this;
}
@@ -107,7 +107,7 @@
void CudaVector::CopyFromCpu(const Vector& x) {
data_.Reserve(x.rows());
- data_.CopyFromCpu(x.data(), x.rows(), context_->stream_);
+ data_.CopyFromCpu(x.data(), x.rows());
num_rows_ = x.rows();
DestroyDescriptor();
CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F),
@@ -117,17 +117,11 @@
void CudaVector::CopyTo(Vector* x) const {
CHECK(x != nullptr);
x->resize(num_rows_);
- // Need to synchronize with any GPU kernels that may be writing to the
- // buffer before the transfer happens.
- CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
data_.CopyToCpu(x->data(), num_rows_);
}
void CudaVector::CopyTo(double* x) const {
CHECK(x != nullptr);
- // Need to synchronize with any GPU kernels that may be writing to the
- // buffer before the transfer happens.
- CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
data_.CopyToCpu(x, num_rows_);
}
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index 6d328ab..93e9f0d 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -347,15 +347,12 @@
#ifndef CERES_NO_CUDA
-bool CUDADenseCholesky::Init(ContextImpl* context, std::string* message) {
- CHECK(context->IsCudaInitialized())
- << "CUDADenseCholesky requires CUDA initialization.";
- cusolver_handle_ = context->cusolver_handle_;
- stream_ = context->stream_;
- error_.Reserve(1);
- *message = "CUDADenseCholesky::Init Success.";
- return true;
-}
+CUDADenseCholesky::CUDADenseCholesky(ContextImpl* context)
+ : context_(context),
+ lhs_{context},
+ rhs_{context},
+ device_workspace_{context},
+ error_(context, 1) {}
LinearSolverTerminationType CUDADenseCholesky::Factorize(int num_cols,
double* lhs,
@@ -363,9 +360,9 @@
factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
lhs_.Reserve(num_cols * num_cols);
num_cols_ = num_cols;
- lhs_.CopyFromCpu(lhs, num_cols * num_cols, stream_);
+ lhs_.CopyFromCpu(lhs, num_cols * num_cols);
int device_workspace_size = 0;
- if (cusolverDnDpotrf_bufferSize(cusolver_handle_,
+ if (cusolverDnDpotrf_bufferSize(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols,
lhs_.data(),
@@ -376,7 +373,7 @@
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
- if (cusolverDnDpotrf(cusolver_handle_,
+ if (cusolverDnDpotrf(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols,
lhs_.data(),
@@ -387,11 +384,6 @@
*message = "cuSolverDN::cusolverDnDpotrf failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
- if (cudaDeviceSynchronize() != cudaSuccess ||
- cudaStreamSynchronize(stream_) != cudaSuccess) {
- *message = "Cuda device synchronization failed.";
- return LinearSolverTerminationType::FATAL_ERROR;
- }
int error = 0;
error_.CopyToCpu(&error, 1);
if (error < 0) {
@@ -422,8 +414,8 @@
*message = "Factorize did not complete successfully previously.";
return factorize_result_;
}
- rhs_.CopyFromCpu(rhs, num_cols_, stream_);
- if (cusolverDnDpotrs(cusolver_handle_,
+ rhs_.CopyFromCpu(rhs, num_cols_);
+ if (cusolverDnDpotrs(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
1,
@@ -435,11 +427,6 @@
*message = "cuSolverDN::cusolverDnDpotrs failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
- if (cudaDeviceSynchronize() != cudaSuccess ||
- cudaStreamSynchronize(stream_) != cudaSuccess) {
- *message = "Cuda device synchronization failed.";
- return LinearSolverTerminationType::FATAL_ERROR;
- }
int error = 0;
error_.CopyToCpu(&error, 1);
if (error != 0) {
@@ -455,56 +442,30 @@
std::unique_ptr<CUDADenseCholesky> CUDADenseCholesky::Create(
const LinearSolver::Options& options) {
- if (options.dense_linear_algebra_library_type != CUDA) {
- // The user called the wrong factory method.
+ if (options.dense_linear_algebra_library_type != CUDA ||
+ options.context == nullptr || !options.context->IsCudaInitialized()) {
return nullptr;
}
- auto cuda_dense_cholesky =
- std::unique_ptr<CUDADenseCholesky>(new CUDADenseCholesky());
- std::string cuda_error;
- if (cuda_dense_cholesky->Init(options.context, &cuda_error)) {
- return cuda_dense_cholesky;
- }
- // Initialization failed, destroy the object (done automatically) and return
- // a nullptr.
- LOG(ERROR) << "CUDADenseCholesky::Init failed: " << cuda_error;
- return nullptr;
+ return std::unique_ptr<CUDADenseCholesky>(
+ new CUDADenseCholesky(options.context));
}
std::unique_ptr<CUDADenseCholeskyMixedPrecision>
CUDADenseCholeskyMixedPrecision::Create(const LinearSolver::Options& options) {
if (options.dense_linear_algebra_library_type != CUDA ||
- !options.use_mixed_precision_solves) {
- // The user called the wrong factory method.
+ !options.use_mixed_precision_solves || options.context == nullptr ||
+ !options.context->IsCudaInitialized()) {
return nullptr;
}
- auto solver = std::unique_ptr<CUDADenseCholeskyMixedPrecision>(
- new CUDADenseCholeskyMixedPrecision());
- std::string cuda_error;
- if (solver->Init(options, &cuda_error)) {
- return solver;
- }
- LOG(ERROR) << "CUDADenseCholeskyMixedPrecision::Init failed: " << cuda_error;
- return nullptr;
-}
-
-bool CUDADenseCholeskyMixedPrecision::Init(const LinearSolver::Options& options,
- std::string* message) {
- CHECK(options.context->IsCudaInitialized())
- << "CUDADenseCholeskyMixedPrecision requires CUDA initialization.";
- cusolver_handle_ = options.context->cusolver_handle_;
- cublas_handle_ = options.context->cublas_handle_;
- stream_ = options.context->stream_;
- error_.Reserve(1);
- max_num_refinement_iterations_ = options.max_num_refinement_iterations;
- *message = "CUDADenseCholeskyMixedPrecision::Init Success.";
- return true;
+ return std::unique_ptr<CUDADenseCholeskyMixedPrecision>(
+ new CUDADenseCholeskyMixedPrecision(
+ options.context, options.max_num_refinement_iterations));
}
LinearSolverTerminationType
CUDADenseCholeskyMixedPrecision::CudaCholeskyFactorize(std::string* message) {
int device_workspace_size = 0;
- if (cusolverDnSpotrf_bufferSize(cusolver_handle_,
+ if (cusolverDnSpotrf_bufferSize(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
lhs_fp32_.data(),
@@ -515,7 +476,7 @@
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
- if (cusolverDnSpotrf(cusolver_handle_,
+ if (cusolverDnSpotrf(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
lhs_fp32_.data(),
@@ -526,11 +487,6 @@
*message = "cuSolverDN::cusolverDnSpotrf failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
- if (cudaDeviceSynchronize() != cudaSuccess ||
- cudaStreamSynchronize(stream_) != cudaSuccess) {
- *message = "Cuda device synchronization failed.";
- return LinearSolverTerminationType::FATAL_ERROR;
- }
int error = 0;
error_.CopyToCpu(&error, 1);
if (error < 0) {
@@ -560,9 +516,9 @@
residual_fp32_.data(),
num_cols_ * sizeof(float),
cudaMemcpyDeviceToDevice,
- stream_),
+ context_->stream_),
cudaSuccess);
- if (cusolverDnSpotrs(cusolver_handle_,
+ if (cusolverDnSpotrs(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
1,
@@ -574,11 +530,6 @@
*message = "cuSolverDN::cusolverDnDpotrs failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
- if (cudaDeviceSynchronize() != cudaSuccess ||
- cudaStreamSynchronize(stream_) != cudaSuccess) {
- *message = "Cuda device synchronization failed.";
- return LinearSolverTerminationType::FATAL_ERROR;
- }
int error = 0;
error_.CopyToCpu(&error, 1);
if (error != 0) {
@@ -591,18 +542,34 @@
return LinearSolverTerminationType::SUCCESS;
}
+CUDADenseCholeskyMixedPrecision::CUDADenseCholeskyMixedPrecision(
+ ContextImpl* context, int max_num_refinement_iterations)
+ : context_(context),
+ lhs_fp64_{context},
+ rhs_fp64_{context},
+ lhs_fp32_{context},
+ device_workspace_{context},
+ error_(context, 1),
+ x_fp64_{context},
+ correction_fp32_{context},
+ residual_fp32_{context},
+ residual_fp64_{context},
+ max_num_refinement_iterations_(max_num_refinement_iterations) {}
+
LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Factorize(
int num_cols, double* lhs, std::string* message) {
num_cols_ = num_cols;
// Copy fp64 version of lhs to GPU.
lhs_fp64_.Reserve(num_cols * num_cols);
- lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols, stream_);
+ lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols);
// Create an fp32 copy of lhs, lhs_fp32.
lhs_fp32_.Reserve(num_cols * num_cols);
- CudaFP64ToFP32(
- lhs_fp64_.data(), lhs_fp32_.data(), num_cols * num_cols, stream_);
+ CudaFP64ToFP32(lhs_fp64_.data(),
+ lhs_fp32_.data(),
+ num_cols * num_cols,
+ context_->stream_);
// Factorize lhs_fp32.
factorize_result_ = CudaCholeskyFactorize(message);
@@ -625,32 +592,35 @@
residual_fp64_.Reserve(num_cols_);
// Initialize x = 0.
- CudaSetZeroFP64(x_fp64_.data(), num_cols_, stream_);
+ CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->stream_);
// Initialize residual = rhs.
- rhs_fp64_.CopyFromCpu(rhs, num_cols_, stream_);
- residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_);
+ rhs_fp64_.CopyFromCpu(rhs, num_cols_);
+ residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_);
for (int i = 0; i <= max_num_refinement_iterations_; ++i) {
// Cast residual from fp64 to fp32.
- CudaFP64ToFP32(
- residual_fp64_.data(), residual_fp32_.data(), num_cols_, stream_);
+ CudaFP64ToFP32(residual_fp64_.data(),
+ residual_fp32_.data(),
+ num_cols_,
+ context_->stream_);
// [fp32] c = lhs^-1 * residual.
auto result = CudaCholeskySolve(message);
if (result != LinearSolverTerminationType::SUCCESS) {
return result;
}
// [fp64] x += c.
- CudaDsxpy(x_fp64_.data(), correction_fp32_.data(), num_cols_, stream_);
+ CudaDsxpy(
+ x_fp64_.data(), correction_fp32_.data(), num_cols_, context_->stream_);
if (i < max_num_refinement_iterations_) {
// [fp64] residual = rhs - lhs * x
// This is done in two steps:
// 1. [fp64] residual = rhs
- residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_);
+ residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_);
// 2. [fp64] residual = residual - lhs * x
double alpha = -1.0;
double beta = 1.0;
- cublasDsymv(cublas_handle_,
+ cublasDsymv(context_->cublas_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
&alpha,
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index 1c561c1..6380fc6 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -40,6 +40,7 @@
#include <vector>
#include "Eigen/Dense"
+#include "ceres/context_impl.h"
#include "ceres/cuda_buffer.h"
#include "ceres/linear_solver.h"
#include "glog/logging.h"
@@ -208,16 +209,9 @@
std::string* message) override;
private:
- CUDADenseCholesky() = default;
- // Picks up the cuSolverDN and cuStream handles from the context. If
- // the context is unable to initialize CUDA, returns false with a
- // human-readable message indicating the reason.
- bool Init(ContextImpl* context, std::string* message);
+ explicit CUDADenseCholesky(ContextImpl* context);
- // Handle to the cuSOLVER context.
- cusolverDnHandle_t cusolver_handle_ = nullptr;
- // CUDA device stream.
- cudaStream_t stream_ = nullptr;
+ ContextImpl* context_ = nullptr;
// Number of columns in the A matrix, to be cached between calls to *Factorize
// and *Solve.
size_t num_cols_ = 0;
@@ -266,7 +260,9 @@
std::string* message) override;
private:
- CUDADenseCholeskyMixedPrecision() = default;
+ CUDADenseCholeskyMixedPrecision(ContextImpl* context,
+ int max_num_refinement_iterations);
+
// Helper function to wrap Cuda boilerplate needed to call Spotrf.
LinearSolverTerminationType CudaCholeskyFactorize(std::string* message);
// Helper function to wrap Cuda boilerplate needed to call Spotrs.
@@ -277,9 +273,7 @@
// human-readable message indicating the reason.
bool Init(const LinearSolver::Options& options, std::string* message);
- cusolverDnHandle_t cusolver_handle_ = nullptr;
- cublasHandle_t cublas_handle_ = nullptr;
- cudaStream_t stream_ = nullptr;
+ ContextImpl* context_ = nullptr;
// Number of columns in the A matrix, to be cached between calls to *Factorize
// and *Solve.
size_t num_cols_ = 0;
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc
index 22727f8..5154fb1 100644
--- a/internal/ceres/dense_qr.cc
+++ b/internal/ceres/dense_qr.cc
@@ -311,17 +311,13 @@
#ifndef CERES_NO_CUDA
-bool CUDADenseQR::Init(ContextImpl* context, std::string* message) {
- if (!context->InitCuda(message)) {
- return false;
- }
- cublas_handle_ = context->cublas_handle_;
- cusolver_handle_ = context->cusolver_handle_;
- stream_ = context->stream_;
- error_.Reserve(1);
- *message = "CUDADenseQR::Init Success.";
- return true;
-}
+CUDADenseQR::CUDADenseQR(ContextImpl* context)
+ : context_(context),
+ lhs_{context},
+ rhs_{context},
+ tau_{context},
+ device_workspace_{context},
+ error_(context, 1) {}
LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows,
int num_cols,
@@ -332,9 +328,9 @@
tau_.Reserve(std::min(num_rows, num_cols));
num_rows_ = num_rows;
num_cols_ = num_cols;
- lhs_.CopyFromCpu(lhs, num_rows * num_cols, stream_);
+ lhs_.CopyFromCpu(lhs, num_rows * num_cols);
int device_workspace_size = 0;
- if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
+ if (cusolverDnDgeqrf_bufferSize(context_->cusolver_handle_,
num_rows,
num_cols,
lhs_.data(),
@@ -345,7 +341,7 @@
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
- if (cusolverDnDgeqrf(cusolver_handle_,
+ if (cusolverDnDgeqrf(context_->cusolver_handle_,
num_rows,
num_cols,
lhs_.data(),
@@ -357,11 +353,6 @@
*message = "cuSolverDN::cusolverDnDgeqrf failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
- if (cudaDeviceSynchronize() != cudaSuccess ||
- cudaStreamSynchronize(stream_) != cudaSuccess) {
- *message = "Cuda device synchronization failed.";
- return LinearSolverTerminationType::FATAL_ERROR;
- }
int error = 0;
error_.CopyToCpu(&error, 1);
if (error < 0) {
@@ -386,9 +377,9 @@
*message = "Factorize did not complete successfully previously.";
return factorize_result_;
}
- rhs_.CopyFromCpu(rhs, num_rows_, stream_);
+ rhs_.CopyFromCpu(rhs, num_rows_);
int device_workspace_size = 0;
- if (cusolverDnDormqr_bufferSize(cusolver_handle_,
+ if (cusolverDnDormqr_bufferSize(context_->cusolver_handle_,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
num_rows_,
@@ -407,7 +398,7 @@
device_workspace_.Reserve(device_workspace_size);
// Compute rhs = Q^T * rhs, assuming that lhs has already been factorized.
// The result of factorization would have stored Q in a packed form in lhs_.
- if (cusolverDnDormqr(cusolver_handle_,
+ if (cusolverDnDormqr(context_->cusolver_handle_,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
num_rows_,
@@ -434,7 +425,7 @@
}
// Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step
// replaced rhs by (Q^T * rhs), this is just x = R \ rhs.
- if (cublasDtrsv(cublas_handle_,
+ if (cublasDtrsv(context_->cublas_handle_,
CUBLAS_FILL_MODE_UPPER,
CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT,
@@ -446,11 +437,6 @@
*message = "cuBLAS::cublasDtrsv failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
- if (cudaDeviceSynchronize() != cudaSuccess ||
- cudaStreamSynchronize(stream_) != cudaSuccess) {
- *message = "Cuda device synchronization failed.";
- return LinearSolverTerminationType::FATAL_ERROR;
- }
rhs_.CopyToCpu(solution, num_cols_);
*message = "Success";
return LinearSolverTerminationType::SUCCESS;
@@ -458,23 +444,13 @@
std::unique_ptr<CUDADenseQR> CUDADenseQR::Create(
const LinearSolver::Options& options) {
- if (options.dense_linear_algebra_library_type != CUDA) {
- // The user called the wrong factory method.
+ if (options.dense_linear_algebra_library_type != CUDA ||
+ options.context == nullptr || !options.context->IsCudaInitialized()) {
return nullptr;
}
- auto cuda_dense_qr = std::unique_ptr<CUDADenseQR>(new CUDADenseQR());
- std::string cuda_error;
- if (cuda_dense_qr->Init(options.context, &cuda_error)) {
- return cuda_dense_qr;
- }
- // Initialization failed, destroy the object (done automatically) and return a
- // nullptr.
- LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error;
- return nullptr;
+ return std::unique_ptr<CUDADenseQR>(new CUDADenseQR(options.context));
}
-CUDADenseQR::CUDADenseQR() = default;
-
#endif // CERES_NO_CUDA
} // namespace ceres::internal
diff --git a/internal/ceres/dense_qr.h b/internal/ceres/dense_qr.h
index 0d2577a..270a912 100644
--- a/internal/ceres/dense_qr.h
+++ b/internal/ceres/dense_qr.h
@@ -40,6 +40,7 @@
#include <vector>
#include "Eigen/Dense"
+#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
@@ -164,18 +165,9 @@
std::string* message) override;
private:
- CUDADenseQR();
- // Picks up the cuSolverDN, cuBLAS, and cuStream handles from the context. If
- // the context is unable to initialize CUDA, returns false with a
- // human-readable message indicating the reason.
- bool Init(ContextImpl* context, std::string* message);
+ explicit CUDADenseQR(ContextImpl* context);
- // Handle to the cuSOLVER context.
- cusolverDnHandle_t cusolver_handle_ = nullptr;
- // Handle to cuBLAS context.
- cublasHandle_t cublas_handle_ = nullptr;
- // CUDA device stream.
- cudaStream_t stream_ = nullptr;
+ ContextImpl* context_ = nullptr;
// Number of rows in the A matrix, to be cached between calls to *Factorize
// and *Solve.
size_t num_rows_ = 0;