CUDA Cleanup

* All Cuda* objects now take a ContextImpl* during construction and save
  the context instead of individual handles.
* Since we no longer use the legacy default stream, the stream must be
  explicitly synchronized before GPU to CPU transfers; CudaBuffer is now
  responsible for this synchronization whenever it is asked to perform
  such a transfer.
* Remove all manual syncs; synchronization is relegated to CudaBuffer,
  which performs it before GPU to CPU transfers.

Change-Id: Ic73cb24174a1e09842827323280e90241716cc20
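For reviewers, a minimal sketch of the new usage pattern (CUDA-enabled build assumed; the Example() function and its data are illustrative, not part of this change). A buffer is bound to a context at construction, copies are enqueued on the context's stream, and CopyToCpu() synchronizes that stream internally:

```cpp
#include <string>
#include <vector>

#include "ceres/context_impl.h"
#include "ceres/cuda_buffer.h"
#include "glog/logging.h"

using ceres::internal::ContextImpl;
using ceres::internal::CudaBuffer;

void Example() {
  ContextImpl context;
  std::string error;
  CHECK(context.InitCuda(&error)) << error;

  std::vector<double> host = {1.0, 2.0, 3.0};
  CudaBuffer<double> device(&context);  // Bound to the context's stream.
  device.CopyFromCpuVector(host);       // Asynchronous, stream-ordered.
  // Kernels launched on context.stream_ here are ordered after the copy.
  std::vector<double> result(host.size());
  device.CopyToCpu(result.data(), result.size());  // Syncs the stream first.
}
```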
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h
index ee0c589..d8f294a 100644
--- a/internal/ceres/context_impl.h
+++ b/internal/ceres/context_impl.h
@@ -81,6 +81,13 @@
   // 3. If the user explicitly selects a GPU in the host process before calling
   // Ceres, Ceres will use that GPU.
+  // Note on Ceres' use of CUDA streams:
+  // All operations on the GPU are performed on a single stream. This ensures
+  // that operations are stream-ordered, so we do not need to explicitly
+  // synchronize the stream at the end of every operation. Stream
+  // synchronization occurs only before GPU to CPU transfers, and is handled
+  // by CudaBuffer.
+
   // Initializes cuBLAS, cuSOLVER, and cuSPARSE contexts, creates an
   // asynchronous CUDA stream, and associates the stream with the contexts.
   // Returns true iff initialization was successful, else it returns false and a
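To make the guarantee above concrete, here is a standalone CUDA sketch (illustrative only, not Ceres code; the Scale kernel is hypothetical). All three operations are enqueued on one stream and therefore execute in issue order, so a single cudaStreamSynchronize before the host reads the result suffices:

```cpp
#include <cstdio>

#include <cuda_runtime.h>

__global__ void Scale(double* x, int n, double a) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

int main() {
  const int n = 4;
  double h[n] = {1.0, 2.0, 3.0, 4.0};
  double* d = nullptr;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMalloc(&d, n * sizeof(double));
  // The three operations below are stream-ordered: each one waits for the
  // previous one, with no explicit synchronization between them.
  cudaMemcpyAsync(d, h, n * sizeof(double), cudaMemcpyHostToDevice, stream);
  Scale<<<1, n, 0, stream>>>(d, n, 2.0);
  cudaMemcpyAsync(h, d, n * sizeof(double), cudaMemcpyDeviceToHost, stream);
  // The only synchronization point: before the host reads the result.
  cudaStreamSynchronize(stream);
  std::printf("h[0] = %f\n", h[0]);  // Prints 2.0.
  cudaFree(d);
  cudaStreamDestroy(stream);
  return 0;
}
```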
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index dba1706..97d126c 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -31,6 +31,7 @@
 #ifndef CERES_INTERNAL_CUDA_BUFFER_H_
 #define CERES_INTERNAL_CUDA_BUFFER_H_
 
+#include "ceres/context_impl.h"
 #include "ceres/internal/config.h"
 
 #ifndef CERES_NO_CUDA
@@ -40,6 +41,7 @@
 #include "cuda_runtime.h"
 #include "glog/logging.h"
 
+namespace ceres::internal {
 // An encapsulated buffer to maintain GPU memory, and handle transfers between
 // GPU and system memory. It is the responsibility of the user to ensure that
 // the appropriate GPU device is selected before each subroutine is called. This
@@ -49,7 +51,10 @@
 template <typename T>
 class CudaBuffer {
  public:
-  CudaBuffer() = default;
+  explicit CudaBuffer(ContextImpl* context) : context_(context) {}
+  CudaBuffer(ContextImpl* context, int size) : context_(context) {
+    Reserve(size);
+  }
 
   CudaBuffer(const CudaBuffer&) = delete;
   CudaBuffer& operator=(const CudaBuffer&) = delete;
@@ -75,51 +80,61 @@
   // Perform an asynchronous copy from CPU memory to GPU memory managed by this
   // CudaBuffer instance using the context's stream.
-  void CopyFromCpu(const T* data, const size_t size, cudaStream_t stream) {
+  void CopyFromCpu(const T* data, const size_t size) {
     Reserve(size);
-    CHECK_EQ(cudaMemcpyAsync(
-                 data_, data, size * sizeof(T), cudaMemcpyHostToDevice, stream),
+    CHECK_EQ(cudaMemcpyAsync(data_,
+                             data,
+                             size * sizeof(T),
+                             cudaMemcpyHostToDevice,
+                             context_->stream_),
              cudaSuccess);
   }
 
   // Perform an asynchronous copy from a vector in CPU memory to GPU memory
   // managed by this CudaBuffer instance.
-  void CopyFromCpuVector(const std::vector<T>& data, cudaStream_t stream) {
+  void CopyFromCpuVector(const std::vector<T>& data) {
     Reserve(data.size());
     CHECK_EQ(cudaMemcpyAsync(data_,
                              data.data(),
                              data.size() * sizeof(T),
                              cudaMemcpyHostToDevice,
-                             stream),
+                             context_->stream_),
              cudaSuccess);
   }
 
   // Perform an asynchronous copy from another GPU memory array to the GPU
   // memory managed by this CudaBuffer instance using the context's stream.
-  void CopyFromGPUArray(const T* data, const size_t size, cudaStream_t stream) {
+  void CopyFromGPUArray(const T* data, const size_t size) {
     Reserve(size);
-    CHECK_EQ(
-        cudaMemcpyAsync(
-            data_, data, size * sizeof(T), cudaMemcpyDeviceToDevice, stream),
-        cudaSuccess);
+    CHECK_EQ(cudaMemcpyAsync(data_,
+                             data,
+                             size * sizeof(T),
+                             cudaMemcpyDeviceToDevice,
+                             context_->stream_),
+             cudaSuccess);
   }
 
   // Copy data from the GPU memory managed by this CudaBuffer instance to CPU
   // memory. It is the caller's responsibility to ensure that the CPU memory
   // pointer is valid, i.e. it is not null, and that it points to memory of
-  // at least this->size() size. This copy is necessarily synchronous since any
-  // potential GPU kernels that may be writing to the buffer must finish before
-  // the transfer happens.
+  // at least this->size() size. This method ensures all previously dispatched
+  // GPU operations on the context's stream have completed before copying the
+  // data to CPU memory.
   void CopyToCpu(T* data, const size_t size) const {
     CHECK(data_ != nullptr);
-    CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
+    CHECK_EQ(cudaMemcpyAsync(data,
+                             data_,
+                             size * sizeof(T),
+                             cudaMemcpyDeviceToHost,
+                             context_->stream_),
              cudaSuccess);
+    CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
   }
 
   // Copy N items from another GPU memory array to the GPU memory managed by
   // this CudaBuffer instance, growing this buffer's size if needed. This copy
   // is asynchronous, and operates on the context's stream.
-  void CopyNItemsFrom(int n, const CudaBuffer<T>& other, cudaStream_t stream) {
+  void CopyNItemsFrom(int n, const CudaBuffer<T>& other) {
     Reserve(n);
     CHECK(other.data_ != nullptr);
     CHECK(data_ != nullptr);
@@ -127,7 +142,7 @@
                              other.data_,
                              size_ * sizeof(T),
                              cudaMemcpyDeviceToDevice,
-                             stream),
+                             context_->stream_),
              cudaSuccess);
   }
 
@@ -141,7 +156,9 @@
  private:
   T* data_ = nullptr;
   size_t size_ = 0;
+  ContextImpl* context_ = nullptr;
 };
+}  // namespace ceres::internal
 
 #endif  // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc
index 053b442..9290aa5 100644
--- a/internal/ceres/cuda_kernels_test.cc
+++ b/internal/ceres/cuda_kernels_test.cc
@@ -36,6 +36,7 @@
 #include <string>
 #include <vector>
 
+#include "ceres/context_impl.h"
 #include "ceres/cuda_buffer.h"
 #include "ceres/internal/config.h"
 #include "ceres/internal/eigen.h"
@@ -48,10 +49,13 @@
 #ifndef CERES_NO_CUDA
 
 TEST(CudaFP64ToFP32, SimpleConversions) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
-  CudaBuffer<double> fp64_gpu;
-  fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
-  CudaBuffer<float> fp32_gpu;
+  CudaBuffer<double> fp64_gpu(&context);
+  fp64_gpu.CopyFromCpuVector(fp64_cpu);
+  CudaBuffer<float> fp32_gpu(&context);
   fp32_gpu.Reserve(fp64_cpu.size());
   CudaFP64ToFP32(
       fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
@@ -63,6 +67,9 @@
 }
 
 TEST(CudaFP64ToFP32, NumericallyExtremeValues) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<double> fp64_cpu = {
       DBL_MIN, 10.0 * DBL_MIN, DBL_MAX, 0.1 * DBL_MAX};
   // First just make sure that the compiler has represented these values
@@ -71,9 +78,9 @@
   EXPECT_GT(fp64_cpu[1], 0.0);
   EXPECT_TRUE(std::isfinite(fp64_cpu[2]));
   EXPECT_TRUE(std::isfinite(fp64_cpu[3]));
-  CudaBuffer<double> fp64_gpu;
-  fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
-  CudaBuffer<float> fp32_gpu;
+  CudaBuffer<double> fp64_gpu(&context);
+  fp64_gpu.CopyFromCpuVector(fp64_cpu);
+  CudaBuffer<float> fp32_gpu(&context);
   fp32_gpu.Reserve(fp64_cpu.size());
   CudaFP64ToFP32(
       fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
@@ -86,10 +93,13 @@
 }
 
 TEST(CudaFP32ToFP64, SimpleConversions) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
-  CudaBuffer<float> fp32_gpu;
-  fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault);
-  CudaBuffer<double> fp64_gpu;
+  CudaBuffer<float> fp32_gpu(&context);
+  fp32_gpu.CopyFromCpuVector(fp32_cpu);
+  CudaBuffer<double> fp64_gpu(&context);
   fp64_gpu.Reserve(fp32_cpu.size());
   CudaFP32ToFP64(
       fp32_gpu.data(), fp64_gpu.data(), fp32_cpu.size(), cudaStreamDefault);
@@ -101,9 +111,12 @@
 }
 
 TEST(CudaSetZeroFP32, NonZeroInput) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
-  CudaBuffer<float> fp32_gpu;
-  fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault);
+  CudaBuffer<float> fp32_gpu(&context);
+  fp32_gpu.CopyFromCpuVector(fp32_cpu);
   CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), cudaStreamDefault);
   std::vector<float> fp32_cpu_zero(fp32_cpu.size());
   fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size());
@@ -113,9 +126,12 @@
 }
 
 TEST(CudaSetZeroFP64, NonZeroInput) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
-  CudaBuffer<double> fp64_gpu;
-  fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault);
+  CudaBuffer<double> fp64_gpu(&context);
+  fp64_gpu.CopyFromCpuVector(fp64_cpu);
   CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), cudaStreamDefault);
   std::vector<double> fp64_cpu_zero(fp64_cpu.size());
   fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size());
@@ -125,13 +141,16 @@
 }
 
 TEST(CudaDsxpy, DoubleValues) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<float> fp32_cpu_a = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
   std::vector<double> fp64_cpu_b = {
       1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
-  CudaBuffer<float> fp32_gpu_a;
-  fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a, cudaStreamDefault);
-  CudaBuffer<double> fp64_gpu_b;
-  fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b, cudaStreamDefault);
+  CudaBuffer<float> fp32_gpu_a(&context);
+  fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a);
+  CudaBuffer<double> fp64_gpu_b(&context);
+  fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b);
   CudaDsxpy(fp64_gpu_b.data(),
             fp32_gpu_a.data(),
             fp32_gpu_a.size(),
@@ -143,15 +162,18 @@
 }
 
 TEST(CudaDtDxpy, ComputeFourItems) {
+  ContextImpl context;
+  std::string cuda_error;
+  EXPECT_TRUE(context.InitCuda(&cuda_error)) << cuda_error;
   std::vector<double> x_cpu = {1, 2, 3, 4};
   std::vector<double> y_cpu = {4, 3, 2, 1};
   std::vector<double> d_cpu = {10, 20, 30, 40};
-  CudaBuffer<double> x_gpu;
-  x_gpu.CopyFromCpuVector(x_cpu, cudaStreamDefault);
-  CudaBuffer<double> y_gpu;
-  y_gpu.CopyFromCpuVector(y_cpu, cudaStreamDefault);
-  CudaBuffer<double> d_gpu;
-  d_gpu.CopyFromCpuVector(d_cpu, cudaStreamDefault);
+  CudaBuffer<double> x_gpu(&context);
+  x_gpu.CopyFromCpuVector(x_cpu);
+  CudaBuffer<double> y_gpu(&context);
+  y_gpu.CopyFromCpuVector(y_cpu);
+  CudaBuffer<double> d_gpu(&context);
+  d_gpu.CopyFromCpuVector(d_cpu);
   CudaDtDxpy(y_gpu.data(),
              d_gpu.data(),
              x_gpu.data(),
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
index 1e361d2..7ee1761 100644
--- a/internal/ceres/cuda_sparse_matrix.cc
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -59,17 +59,21 @@
 
 namespace ceres::internal {
 
-CudaSparseMatrix::CudaSparseMatrix(
-    ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) {
+CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
+                                   const CompressedRowSparseMatrix& crs_matrix)
+    : context_(context),
+      rows_{context},
+      cols_{context},
+      values_{context},
+      spmv_buffer_{context} {
   DCHECK_NE(context, nullptr);
   CHECK(context->IsCudaInitialized());
-  context_ = context;
   num_rows_ = crs_matrix.num_rows();
   num_cols_ = crs_matrix.num_cols();
   num_nonzeros_ = crs_matrix.num_nonzeros();
-  rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1, context_->stream_);
-  cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_, context_->stream_);
-  values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_);
+  rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1);
+  cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_);
+  values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
   cusparseCreateCsr(&descr_,
                     num_rows_,
                     num_cols_,
@@ -96,7 +100,7 @@
   CHECK_EQ(num_rows_, crs_matrix.num_rows());
   CHECK_EQ(num_cols_, crs_matrix.num_cols());
   CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
-  values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_);
+  values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
 }
 
 void CudaSparseMatrix::SpMv(cusparseOperation_t op,
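For context, the three copies in the constructor follow the standard CSR convention: rows_ holds num_rows_ + 1 offsets, while cols_ and values_ hold one entry per nonzero. A small illustrative example (values made up, not from this change):

```cpp
#include <cstdint>

// CSR layout of the 2x3 matrix [[1, 0, 2], [0, 3, 0]].
// rows has num_rows + 1 entries; row i occupies [rows[i], rows[i+1]).
const int32_t rows[] = {0, 2, 3};
// cols and values have num_nonzeros entries each.
const int32_t cols[] = {0, 2, 1};         // Column index of each nonzero.
const double values[] = {1.0, 2.0, 3.0};  // Nonzero values in row order.
```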
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h
index 62f6b77..d5f3332 100644
--- a/internal/ceres/cuda_sparse_matrix.h
+++ b/internal/ceres/cuda_sparse_matrix.h
@@ -97,6 +97,7 @@
   int num_cols_ = 0;
   int num_nonzeros_ = 0;
 
+  ContextImpl* context_ = nullptr;
   // CSR row indices.
   CudaBuffer<int32_t> rows_;
   // CSR column indices.
@@ -104,8 +105,6 @@
   // CSR values.
   CudaBuffer<double> values_;
 
-  ContextImpl* context_ = nullptr;
-
   // CuSparse object that describes this matrix.
   cusparseSpMatDescr_t descr_ = nullptr;
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
index 46e6cb2..274a47a 100644
--- a/internal/ceres/cuda_vector.cc
+++ b/internal/ceres/cuda_vector.cc
@@ -50,17 +50,17 @@
 
 namespace ceres::internal {
 
-CudaVector::CudaVector(ContextImpl* context, int size) {
+CudaVector::CudaVector(ContextImpl* context, int size)
+    : context_(context), data_(context, size) {
   DCHECK_NE(context, nullptr);
-  CHECK(context->IsCudaInitialized());
-  context_ = context;
+  DCHECK(context->IsCudaInitialized());
   Resize(size);
 }
 
 CudaVector& CudaVector::operator=(const CudaVector& other) {
   if (this != &other) {
     Resize(other.num_rows());
-    data_.CopyFromGPUArray(other.data_.data(), num_rows_, context_->stream_);
+    data_.CopyFromGPUArray(other.data_.data(), num_rows_);
   }
   return *this;
 }
@@ -107,7 +107,7 @@
 
 void CudaVector::CopyFromCpu(const Vector& x) {
   data_.Reserve(x.rows());
-  data_.CopyFromCpu(x.data(), x.rows(), context_->stream_);
+  data_.CopyFromCpu(x.data(), x.rows());
   num_rows_ = x.rows();
   DestroyDescriptor();
   CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F),
@@ -117,17 +117,11 @@
 void CudaVector::CopyTo(Vector* x) const {
   CHECK(x != nullptr);
   x->resize(num_rows_);
-  // Need to synchronize with any GPU kernels that may be writing to the
-  // buffer before the transfer happens.
-  CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
   data_.CopyToCpu(x->data(), num_rows_);
 }
 
 void CudaVector::CopyTo(double* x) const {
   CHECK(x != nullptr);
-  // Need to synchronize with any GPU kernels that may be writing to the
-  // buffer before the transfer happens.
-  CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
   data_.CopyToCpu(x, num_rows_);
 }
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index 6d328ab..93e9f0d 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -347,15 +347,12 @@
 
 #ifndef CERES_NO_CUDA
 
-bool CUDADenseCholesky::Init(ContextImpl* context, std::string* message) {
-  CHECK(context->IsCudaInitialized())
-      << "CUDADenseCholesky requires CUDA initialization.";
-  cusolver_handle_ = context->cusolver_handle_;
-  stream_ = context->stream_;
-  error_.Reserve(1);
-  *message = "CUDADenseCholesky::Init Success.";
-  return true;
-}
+CUDADenseCholesky::CUDADenseCholesky(ContextImpl* context)
+    : context_(context),
+      lhs_{context},
+      rhs_{context},
+      device_workspace_{context},
+      error_(context, 1) {}
 
 LinearSolverTerminationType CUDADenseCholesky::Factorize(int num_cols,
                                                          double* lhs,
@@ -363,9 +360,9 @@
   factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
   lhs_.Reserve(num_cols * num_cols);
   num_cols_ = num_cols;
-  lhs_.CopyFromCpu(lhs, num_cols * num_cols, stream_);
+  lhs_.CopyFromCpu(lhs, num_cols * num_cols);
   int device_workspace_size = 0;
-  if (cusolverDnDpotrf_bufferSize(cusolver_handle_,
+  if (cusolverDnDpotrf_bufferSize(context_->cusolver_handle_,
                                   CUBLAS_FILL_MODE_LOWER,
                                   num_cols,
                                   lhs_.data(),
@@ -376,7 +373,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   device_workspace_.Reserve(device_workspace_size);
-  if (cusolverDnDpotrf(cusolver_handle_,
+  if (cusolverDnDpotrf(context_->cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols,
                        lhs_.data(),
@@ -387,11 +384,6 @@
     *message = "cuSolverDN::cusolverDnDpotrf failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   int error = 0;
   error_.CopyToCpu(&error, 1);
   if (error < 0) {
@@ -422,8 +414,8 @@
     *message = "Factorize did not complete successfully previously.";
     return factorize_result_;
   }
-  rhs_.CopyFromCpu(rhs, num_cols_, stream_);
-  if (cusolverDnDpotrs(cusolver_handle_,
+  rhs_.CopyFromCpu(rhs, num_cols_);
+  if (cusolverDnDpotrs(context_->cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols_,
                        1,
@@ -435,11 +427,6 @@
     *message = "cuSolverDN::cusolverDnDpotrs failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   int error = 0;
   error_.CopyToCpu(&error, 1);
   if (error != 0) {
@@ -455,56 +442,30 @@
 
 std::unique_ptr<CUDADenseCholesky> CUDADenseCholesky::Create(
     const LinearSolver::Options& options) {
-  if (options.dense_linear_algebra_library_type != CUDA) {
-    // The user called the wrong factory method.
+  if (options.dense_linear_algebra_library_type != CUDA ||
+      options.context == nullptr || !options.context->IsCudaInitialized()) {
     return nullptr;
   }
-  auto cuda_dense_cholesky =
-      std::unique_ptr<CUDADenseCholesky>(new CUDADenseCholesky());
-  std::string cuda_error;
-  if (cuda_dense_cholesky->Init(options.context, &cuda_error)) {
-    return cuda_dense_cholesky;
-  }
-  // Initialization failed, destroy the object (done automatically) and return
-  // a nullptr.
-  LOG(ERROR) << "CUDADenseCholesky::Init failed: " << cuda_error;
-  return nullptr;
+  return std::unique_ptr<CUDADenseCholesky>(
+      new CUDADenseCholesky(options.context));
 }
 
 std::unique_ptr<CUDADenseCholeskyMixedPrecision>
 CUDADenseCholeskyMixedPrecision::Create(const LinearSolver::Options& options) {
   if (options.dense_linear_algebra_library_type != CUDA ||
-      !options.use_mixed_precision_solves) {
-    // The user called the wrong factory method.
+      !options.use_mixed_precision_solves || options.context == nullptr ||
+      !options.context->IsCudaInitialized()) {
     return nullptr;
   }
-  auto solver = std::unique_ptr<CUDADenseCholeskyMixedPrecision>(
-      new CUDADenseCholeskyMixedPrecision());
-  std::string cuda_error;
-  if (solver->Init(options, &cuda_error)) {
-    return solver;
-  }
-  LOG(ERROR) << "CUDADenseCholeskyMixedPrecision::Init failed: " << cuda_error;
-  return nullptr;
-}
-
-bool CUDADenseCholeskyMixedPrecision::Init(const LinearSolver::Options& options,
-                                           std::string* message) {
-  CHECK(options.context->IsCudaInitialized())
-      << "CUDADenseCholeskyMixedPrecision requires CUDA initialization.";
-  cusolver_handle_ = options.context->cusolver_handle_;
-  cublas_handle_ = options.context->cublas_handle_;
-  stream_ = options.context->stream_;
-  error_.Reserve(1);
-  max_num_refinement_iterations_ = options.max_num_refinement_iterations;
-  *message = "CUDADenseCholeskyMixedPrecision::Init Success.";
-  return true;
+  return std::unique_ptr<CUDADenseCholeskyMixedPrecision>(
+      new CUDADenseCholeskyMixedPrecision(
+          options.context, options.max_num_refinement_iterations));
 }
 
 LinearSolverTerminationType
 CUDADenseCholeskyMixedPrecision::CudaCholeskyFactorize(std::string* message) {
   int device_workspace_size = 0;
-  if (cusolverDnSpotrf_bufferSize(cusolver_handle_,
+  if (cusolverDnSpotrf_bufferSize(context_->cusolver_handle_,
                                   CUBLAS_FILL_MODE_LOWER,
                                   num_cols_,
                                   lhs_fp32_.data(),
@@ -515,7 +476,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   device_workspace_.Reserve(device_workspace_size);
-  if (cusolverDnSpotrf(cusolver_handle_,
+  if (cusolverDnSpotrf(context_->cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols_,
                        lhs_fp32_.data(),
@@ -526,11 +487,6 @@
     *message = "cuSolverDN::cusolverDnSpotrf failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   int error = 0;
   error_.CopyToCpu(&error, 1);
   if (error < 0) {
@@ -560,9 +516,9 @@
                            residual_fp32_.data(),
                            num_cols_ * sizeof(float),
                            cudaMemcpyDeviceToDevice,
-                           stream_),
+                           context_->stream_),
            cudaSuccess);
-  if (cusolverDnSpotrs(cusolver_handle_,
+  if (cusolverDnSpotrs(context_->cusolver_handle_,
                        CUBLAS_FILL_MODE_LOWER,
                        num_cols_,
                        1,
@@ -574,11 +530,6 @@
     *message = "cuSolverDN::cusolverDnDpotrs failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   int error = 0;
   error_.CopyToCpu(&error, 1);
   if (error != 0) {
@@ -591,18 +542,34 @@
   return LinearSolverTerminationType::SUCCESS;
 }
 
+CUDADenseCholeskyMixedPrecision::CUDADenseCholeskyMixedPrecision(
+    ContextImpl* context, int max_num_refinement_iterations)
+    : context_(context),
+      lhs_fp64_{context},
+      rhs_fp64_{context},
+      lhs_fp32_{context},
+      device_workspace_{context},
+      error_(context, 1),
+      x_fp64_{context},
+      correction_fp32_{context},
+      residual_fp32_{context},
+      residual_fp64_{context},
+      max_num_refinement_iterations_(max_num_refinement_iterations) {}
+
 LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Factorize(
     int num_cols, double* lhs, std::string* message) {
   num_cols_ = num_cols;
 
   // Copy fp64 version of lhs to GPU.
   lhs_fp64_.Reserve(num_cols * num_cols);
-  lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols, stream_);
+  lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols);
 
   // Create an fp32 copy of lhs, lhs_fp32.
   lhs_fp32_.Reserve(num_cols * num_cols);
-  CudaFP64ToFP32(
-      lhs_fp64_.data(), lhs_fp32_.data(), num_cols * num_cols, stream_);
+  CudaFP64ToFP32(lhs_fp64_.data(),
+                 lhs_fp32_.data(),
+                 num_cols * num_cols,
+                 context_->stream_);
 
   // Factorize lhs_fp32.
   factorize_result_ = CudaCholeskyFactorize(message);
@@ -625,32 +592,35 @@
   residual_fp64_.Reserve(num_cols_);
 
   // Initialize x = 0.
-  CudaSetZeroFP64(x_fp64_.data(), num_cols_, stream_);
+  CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->stream_);
   // Initialize residual = rhs.
-  rhs_fp64_.CopyFromCpu(rhs, num_cols_, stream_);
-  residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_);
+  rhs_fp64_.CopyFromCpu(rhs, num_cols_);
+  residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_);
 
   for (int i = 0; i <= max_num_refinement_iterations_; ++i) {
     // Cast residual from fp64 to fp32.
-    CudaFP64ToFP32(
-        residual_fp64_.data(), residual_fp32_.data(), num_cols_, stream_);
+    CudaFP64ToFP32(residual_fp64_.data(),
+                   residual_fp32_.data(),
+                   num_cols_,
+                   context_->stream_);
     // [fp32] c = lhs^-1 * residual.
     auto result = CudaCholeskySolve(message);
     if (result != LinearSolverTerminationType::SUCCESS) {
      return result;
    }
    // [fp64] x += c.
-    CudaDsxpy(x_fp64_.data(), correction_fp32_.data(), num_cols_, stream_);
+    CudaDsxpy(
+        x_fp64_.data(), correction_fp32_.data(), num_cols_, context_->stream_);
    if (i < max_num_refinement_iterations_) {
      // [fp64] residual = rhs - lhs * x
      // This is done in two steps:
      // 1. [fp64] residual = rhs
-      residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_);
+      residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_);
      // 2. [fp64] residual = residual - lhs * x
      double alpha = -1.0;
      double beta = 1.0;
-      cublasDsymv(cublas_handle_,
+      cublasDsymv(context_->cublas_handle_,
                  CUBLAS_FILL_MODE_LOWER,
                  num_cols_,
                  &alpha,
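The Solve() loop above is classical mixed-precision iterative refinement. Writing lhs as A (with fp32 Cholesky factor A~) and rhs as b, each iteration is:

```latex
\begin{aligned}
x_0 &= 0, \qquad r_0 = b, \\
c_i &= \tilde{A}^{-1} r_i  && \text{(fp32 Cholesky solve)} \\
x_{i+1} &= x_i + c_i       && \text{(accumulated in fp64 via CudaDsxpy)} \\
r_{i+1} &= b - A x_{i+1}   && \text{(fp64 residual via cublasDsymv)}
\end{aligned}
```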
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index 1c561c1..6380fc6 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -40,6 +40,7 @@
 #include <vector>
 
 #include "Eigen/Dense"
+#include "ceres/context_impl.h"
 #include "ceres/cuda_buffer.h"
 #include "ceres/linear_solver.h"
 #include "glog/logging.h"
@@ -208,16 +209,9 @@
                                     std::string* message) override;
 
  private:
-  CUDADenseCholesky() = default;
-  // Picks up the cuSolverDN and cuStream handles from the context. If
-  // the context is unable to initialize CUDA, returns false with a
-  // human-readable message indicating the reason.
-  bool Init(ContextImpl* context, std::string* message);
+  explicit CUDADenseCholesky(ContextImpl* context);
 
-  // Handle to the cuSOLVER context.
-  cusolverDnHandle_t cusolver_handle_ = nullptr;
-  // CUDA device stream.
-  cudaStream_t stream_ = nullptr;
+  ContextImpl* context_ = nullptr;
   // Number of columns in the A matrix, to be cached between calls to *Factorize
   // and *Solve.
   size_t num_cols_ = 0;
@@ -266,7 +260,9 @@
                                     std::string* message) override;
 
  private:
-  CUDADenseCholeskyMixedPrecision() = default;
+  CUDADenseCholeskyMixedPrecision(ContextImpl* context,
+                                  int max_num_refinement_iterations);
+
   // Helper function to wrap Cuda boilerplate needed to call Spotrf.
   LinearSolverTerminationType CudaCholeskyFactorize(std::string* message);
   // Helper function to wrap Cuda boilerplate needed to call Spotrs.
@@ -277,9 +273,7 @@
   // human-readable message indicating the reason.
   bool Init(const LinearSolver::Options& options, std::string* message);
 
-  cusolverDnHandle_t cusolver_handle_ = nullptr;
-  cublasHandle_t cublas_handle_ = nullptr;
-  cudaStream_t stream_ = nullptr;
+  ContextImpl* context_ = nullptr;
   // Number of columns in the A matrix, to be cached between calls to *Factorize
   // and *Solve.
   size_t num_cols_ = 0;
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc
index 22727f8..5154fb1 100644
--- a/internal/ceres/dense_qr.cc
+++ b/internal/ceres/dense_qr.cc
@@ -311,17 +311,13 @@
 
 #ifndef CERES_NO_CUDA
 
-bool CUDADenseQR::Init(ContextImpl* context, std::string* message) {
-  if (!context->InitCuda(message)) {
-    return false;
-  }
-  cublas_handle_ = context->cublas_handle_;
-  cusolver_handle_ = context->cusolver_handle_;
-  stream_ = context->stream_;
-  error_.Reserve(1);
-  *message = "CUDADenseQR::Init Success.";
-  return true;
-}
+CUDADenseQR::CUDADenseQR(ContextImpl* context)
+    : context_(context),
+      lhs_{context},
+      rhs_{context},
+      tau_{context},
+      device_workspace_{context},
+      error_(context, 1) {}
 
 LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows,
                                                    int num_cols,
@@ -332,9 +328,9 @@
   tau_.Reserve(std::min(num_rows, num_cols));
   num_rows_ = num_rows;
   num_cols_ = num_cols;
-  lhs_.CopyFromCpu(lhs, num_rows * num_cols, stream_);
+  lhs_.CopyFromCpu(lhs, num_rows * num_cols);
   int device_workspace_size = 0;
-  if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
+  if (cusolverDnDgeqrf_bufferSize(context_->cusolver_handle_,
                                   num_rows,
                                   num_cols,
                                   lhs_.data(),
@@ -345,7 +341,7 @@
     return LinearSolverTerminationType::FATAL_ERROR;
   }
   device_workspace_.Reserve(device_workspace_size);
-  if (cusolverDnDgeqrf(cusolver_handle_,
+  if (cusolverDnDgeqrf(context_->cusolver_handle_,
                        num_rows,
                        num_cols,
                        lhs_.data(),
@@ -357,11 +353,6 @@
     *message = "cuSolverDN::cusolverDnDgeqrf failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   int error = 0;
   error_.CopyToCpu(&error, 1);
   if (error < 0) {
@@ -386,9 +377,9 @@
     *message = "Factorize did not complete successfully previously.";
     return factorize_result_;
   }
-  rhs_.CopyFromCpu(rhs, num_rows_, stream_);
+  rhs_.CopyFromCpu(rhs, num_rows_);
   int device_workspace_size = 0;
-  if (cusolverDnDormqr_bufferSize(cusolver_handle_,
+  if (cusolverDnDormqr_bufferSize(context_->cusolver_handle_,
                                   CUBLAS_SIDE_LEFT,
                                   CUBLAS_OP_T,
                                   num_rows_,
@@ -407,7 +398,7 @@
   device_workspace_.Reserve(device_workspace_size);
   // Compute rhs = Q^T * rhs, assuming that lhs has already been factorized.
   // The result of factorization would have stored Q in a packed form in lhs_.
-  if (cusolverDnDormqr(cusolver_handle_,
+  if (cusolverDnDormqr(context_->cusolver_handle_,
                        CUBLAS_SIDE_LEFT,
                        CUBLAS_OP_T,
                        num_rows_,
@@ -434,7 +425,7 @@
   }
   // Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step
   // replaced rhs by (Q^T * rhs), this is just x = R \ rhs.
-  if (cublasDtrsv(cublas_handle_,
+  if (cublasDtrsv(context_->cublas_handle_,
                   CUBLAS_FILL_MODE_UPPER,
                   CUBLAS_OP_N,
                   CUBLAS_DIAG_NON_UNIT,
@@ -446,11 +437,6 @@
     *message = "cuBLAS::cublasDtrsv failed.";
     return LinearSolverTerminationType::FATAL_ERROR;
   }
-  if (cudaDeviceSynchronize() != cudaSuccess ||
-      cudaStreamSynchronize(stream_) != cudaSuccess) {
-    *message = "Cuda device synchronization failed.";
-    return LinearSolverTerminationType::FATAL_ERROR;
-  }
   rhs_.CopyToCpu(solution, num_cols_);
   *message = "Success";
   return LinearSolverTerminationType::SUCCESS;
@@ -458,23 +444,13 @@
 
 std::unique_ptr<CUDADenseQR> CUDADenseQR::Create(
     const LinearSolver::Options& options) {
-  if (options.dense_linear_algebra_library_type != CUDA) {
-    // The user called the wrong factory method.
+  if (options.dense_linear_algebra_library_type != CUDA ||
+      options.context == nullptr || !options.context->IsCudaInitialized()) {
     return nullptr;
   }
-  auto cuda_dense_qr = std::unique_ptr<CUDADenseQR>(new CUDADenseQR());
-  std::string cuda_error;
-  if (cuda_dense_qr->Init(options.context, &cuda_error)) {
-    return cuda_dense_qr;
-  }
-  // Initialization failed, destroy the object (done automatically) and return a
-  // nullptr.
-  LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error;
-  return nullptr;
+  return std::unique_ptr<CUDADenseQR>(new CUDADenseQR(options.context));
 }
 
-CUDADenseQR::CUDADenseQR() = default;
-
 #endif  // CERES_NO_CUDA
 
 }  // namespace ceres::internal
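For reference, the Solve() path in this file computes the least-squares solution exactly as its comments describe: cusolverDnDormqr applies Q^T to the right-hand side, and cublasDtrsv back-substitutes with R:

```latex
A = QR \quad\Longrightarrow\quad
\min_x \|Ax - b\|_2 \text{ is attained at } x = R^{-1} \left(Q^{\mathsf{T}} b\right).
```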
diff --git a/internal/ceres/dense_qr.h b/internal/ceres/dense_qr.h
index 0d2577a..270a912 100644
--- a/internal/ceres/dense_qr.h
+++ b/internal/ceres/dense_qr.h
@@ -40,6 +40,7 @@
 #include <vector>
 
 #include "Eigen/Dense"
+#include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/internal/export.h"
@@ -164,18 +165,9 @@
                                     std::string* message) override;
 
  private:
-  CUDADenseQR();
-  // Picks up the cuSolverDN, cuBLAS, and cuStream handles from the context. If
-  // the context is unable to initialize CUDA, returns false with a
-  // human-readable message indicating the reason.
-  bool Init(ContextImpl* context, std::string* message);
+  explicit CUDADenseQR(ContextImpl* context);
 
-  // Handle to the cuSOLVER context.
-  cusolverDnHandle_t cusolver_handle_ = nullptr;
-  // Handle to cuBLAS context.
-  cublasHandle_t cublas_handle_ = nullptr;
-  // CUDA device stream.
-  cudaStream_t stream_ = nullptr;
+  ContextImpl* context_ = nullptr;
   // Number of rows in the A matrix, to be cached between calls to *Factorize
   // and *Solve.
   size_t num_rows_ = 0;