Add support for dense CUDA solvers #3 1. Add CUDA initialization and cleanup management to the ContextImpl object. The ContextImpl is now solely responsible for managing CUDA-related resources. 2. All CUDA dense solvers now use lazy CUDA initialization via the ContextImpl object. Change-Id: Ief456860c72e462367ee997d389c19e2bff50baf
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc index 1acf724..ed239a5 100644 --- a/internal/ceres/context_impl.cc +++ b/internal/ceres/context_impl.cc
@@ -30,11 +30,73 @@ #include "ceres/context_impl.h" +#include <string> + +#ifndef CERES_NO_CUDA +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "cusolverDn.h" +#endif // CERES_NO_CUDA + namespace ceres { namespace internal { ContextImpl::ContextImpl() = default; +#ifndef CERES_NO_CUDA +bool ContextImpl::InitCUDA(std::string* message) { + if (cuda_initialized_) { + return true; + } + if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { + *message = "cuBLAS::cublasCreate failed."; + cublas_handle_ = nullptr; + return false; + } + if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnCreate failed."; + cusolver_handle_ = nullptr; + cublasDestroy(cublas_handle_); + cublas_handle_ = nullptr; + return false; + } + if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) != + cudaSuccess) { + *message = "CUDA::cudaStreamCreateWithFlags failed."; + cusolverDnDestroy(cusolver_handle_); + cublasDestroy(cublas_handle_); + cusolver_handle_ = nullptr; + cublas_handle_ = nullptr; + stream_ = nullptr; + return false; + } + if (cusolverDnSetStream(cusolver_handle_, stream_) != + CUSOLVER_STATUS_SUCCESS || + cublasSetStream(cublas_handle_, stream_) != CUBLAS_STATUS_SUCCESS) { + *message = + "cuSolverDN::cusolverDnSetStream or cuBLAS::cublasSetStream failed."; + cusolverDnDestroy(cusolver_handle_); + cublasDestroy(cublas_handle_); + cudaStreamDestroy(stream_); + cusolver_handle_ = nullptr; + cublas_handle_ = nullptr; + stream_ = nullptr; + return false; + } + cuda_initialized_ = true; + return true; +} +#endif // CERES_NO_CUDA + +ContextImpl::~ContextImpl() { +#ifndef CERES_NO_CUDA + if (cuda_initialized_) { + cusolverDnDestroy(cusolver_handle_); + cublasDestroy(cublas_handle_); + cudaStreamDestroy(stream_); + } +#endif // CERES_NO_CUDA +} void ContextImpl::EnsureMinimumThreads(int num_threads) { #ifdef CERES_USE_CXX_THREADS thread_pool.Resize(num_threads);
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h index 7d1e6d3..1944549 100644 --- a/internal/ceres/context_impl.h +++ b/internal/ceres/context_impl.h
@@ -40,6 +40,12 @@ #include "ceres/internal/disable_warnings.h" #include "ceres/internal/export.h" +#ifndef CERES_NO_CUDA +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "cusolverDn.h" +#endif // CERES_NO_CUDA + #ifdef CERES_USE_CXX_THREADS #include "ceres/thread_pool.h" #endif // CERES_USE_CXX_THREADS @@ -50,6 +56,7 @@ class CERES_NO_EXPORT ContextImpl : public Context { public: ContextImpl(); + ~ContextImpl() override; ContextImpl(const ContextImpl&) = delete; void operator=(const ContextImpl&) = delete; @@ -62,6 +69,23 @@ #ifdef CERES_USE_CXX_THREADS ThreadPool thread_pool; #endif // CERES_USE_CXX_THREADS + +#ifndef CERES_NO_CUDA + // Initializes the cuSolverDN context, creates an asynchronous stream, and + // associates the stream with cuSolverDN. Returns true iff initialization was + // successful, else it returns false and a human-readable error message is + // returned. + bool InitCUDA(std::string* message); + + // Handle to the cuSOLVER context. + cusolverDnHandle_t cusolver_handle_ = nullptr; + // Handle to cuBLAS context. + cublasHandle_t cublas_handle_ = nullptr; + // CUDA device stream. + cudaStream_t stream_ = nullptr; + // Indicates whether all the CUDA resources have been initialized. + bool cuda_initialized_ = false; +#endif // CERES_NO_CUDA }; } // namespace internal
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc index 6c4dcc3..cca97d8 100644 --- a/internal/ceres/cuda_dense_cholesky_test.cc +++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -43,6 +43,8 @@ TEST(CUDADenseCholesky, InvalidOptionOnCreate) { LinearSolver::Options options; + ContextImpl context; + options.context = &context; auto dense_cuda_solver = CUDADenseCholesky::Create(options); EXPECT_EQ(dense_cuda_solver, nullptr); } @@ -56,6 +58,8 @@ 0, 0, 0, 1; const Eigen::Vector4d b = Eigen::Vector4d::Ones(); LinearSolver::Options options; + ContextImpl context; + options.context = &context; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -80,6 +84,8 @@ 0, 0, 0; const Eigen::Vector3d b = Eigen::Vector3d::Ones(); LinearSolver::Options options; + ContextImpl context; + options.context = &context; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -97,6 +103,8 @@ 0, 0, -1; const Eigen::Vector3d b = Eigen::Vector3d::Ones(); LinearSolver::Options options; + ContextImpl context; + options.context = &context; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -110,6 +118,8 @@ TEST(CUDADenseCholesky, MustFactorizeBeforeSolve) { const Eigen::Vector3d b = Eigen::Vector3d::Ones(); LinearSolver::Options options; + ContextImpl context; + options.context = &context; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr);
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc index 15ba00a..6a64298 100644 --- a/internal/ceres/cuda_dense_qr_test.cc +++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -56,6 +56,8 @@ 0, 0, 0, 1; const Eigen::Vector4d b = Eigen::Vector4d::Ones(); LinearSolver::Options options; + ContextImpl context; + options.context = &context; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseQR::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -85,6 +87,8 @@ 0, 0; const std::vector<double> b(4, 1.0); LinearSolver::Options options; + ContextImpl context; + options.context = &context; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseQR::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -107,6 +111,8 @@ TEST(CUDADenseQR, MustFactorizeBeforeSolve) { const Eigen::Vector3d b = Eigen::Vector3d::Ones(); LinearSolver::Options options; + ContextImpl context; + options.context = &context; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseQR::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -123,6 +129,8 @@ using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>; LinearSolver::Options options; + ContextImpl context; + options.context = &context; options.dense_linear_algebra_library_type = ceres::CUDA; std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options);
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc index 2d8c2da..f426df5 100644 --- a/internal/ceres/dense_cholesky.cc +++ b/internal/ceres/dense_cholesky.cc
@@ -36,6 +36,7 @@ #include <vector> #ifndef CERES_NO_CUDA +#include "ceres/context_impl.h" #include "cuda_runtime.h" #include "cusolverDn.h" #endif // CERES_NO_CUDA @@ -193,36 +194,17 @@ #ifndef CERES_NO_CUDA -bool CUDADenseCholesky::Init(std::string* message) { - if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) { - *message = "cuSolverDN::cusolverDnCreate failed."; +bool CUDADenseCholesky::Init(ContextImpl* context, std::string* message) { + if (!context->InitCUDA(message)) { return false; } - if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) != - cudaSuccess) { - *message = "CUDA::cudaStreamCreateWithFlags failed."; - cusolverDnDestroy(cusolver_handle_); - return false; - } - if (cusolverDnSetStream(cusolver_handle_, stream_) != - CUSOLVER_STATUS_SUCCESS) { - *message = "cuSolverDN::cusolverDnSetStream failed."; - cudaStreamDestroy(stream_); - cusolverDnDestroy(cusolver_handle_); - return false; - } + cusolver_handle_ = context->cusolver_handle_; + stream_ = context->stream_; error_.Reserve(1); *message = "CUDADenseCholesky::Init Success."; return true; } -CUDADenseCholesky::~CUDADenseCholesky() { - if (cusolver_handle_ != nullptr) { - CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS); - CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess); - } -} - LinearSolverTerminationType CUDADenseCholesky::Factorize( int num_cols, double* lhs, std::string* message) { factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; @@ -326,7 +308,7 @@ auto cuda_dense_cholesky = std::unique_ptr<CUDADenseCholesky>(new CUDADenseCholesky()); std::string cuda_error; - if (cuda_dense_cholesky->Init(&cuda_error)) { + if (cuda_dense_cholesky->Init(options.context, &cuda_error)) { return cuda_dense_cholesky; } // Initialization failed, destroy the object (done automatically) and return a
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h index 49d780c..b40e69a 100644 --- a/internal/ceres/dense_cholesky.h +++ b/internal/ceres/dense_cholesky.h
@@ -44,6 +44,7 @@ #include "ceres/linear_solver.h" #include "glog/logging.h" #ifndef CERES_NO_CUDA +#include "ceres/context_impl.h" #include "cuda_runtime.h" #include "cusolverDn.h" #endif // CERES_NO_CUDA @@ -140,7 +141,6 @@ public: static std::unique_ptr<CUDADenseCholesky> Create( const LinearSolver::Options& options); - ~CUDADenseCholesky() override; CUDADenseCholesky(const CUDADenseCholesky&) = delete; CUDADenseCholesky& operator=(const CUDADenseCholesky&) = delete; LinearSolverTerminationType Factorize(int num_cols, @@ -152,11 +152,10 @@ private: CUDADenseCholesky() = default; - // Initializes the cuSolverDN context, creates an asynchronous stream, and - // associates the stream with cuSolverDN. Returns true iff initialization was - // successful, else it returns false and a human-readable error message is - // returned. - bool Init(std::string* message); + // Picks up the cuSolverDN and cuStream handles from the context. If + // the context is unable to initialize CUDA, returns false with a + // human-readable message indicating the reason. + bool Init(ContextImpl* context, std::string* message); // Handle to the cuSOLVER context. cusolverDnHandle_t cusolver_handle_ = nullptr;
diff --git a/internal/ceres/dense_cholesky_test.cc b/internal/ceres/dense_cholesky_test.cc index eb1c336..034206a 100644 --- a/internal/ceres/dense_cholesky_test.cc +++ b/internal/ceres/dense_cholesky_test.cc
@@ -65,6 +65,8 @@ using VectorType = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>; LinearSolver::Options options; + ContextImpl context; + options.context = &context; options.dense_linear_algebra_library_type = GetParam(); std::unique_ptr<DenseCholesky> dense_cholesky = DenseCholesky::Create(options);
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc index 77f04ad..ad9b64e 100644 --- a/internal/ceres/dense_qr.cc +++ b/internal/ceres/dense_qr.cc
@@ -34,6 +34,7 @@ #include <memory> #include <string> #ifndef CERES_NO_CUDA +#include "ceres/context_impl.h" #include "cusolverDn.h" #include "cublas_v2.h" #endif // CERES_NO_CUDA @@ -310,53 +311,18 @@ #ifndef CERES_NO_CUDA -bool CUDADenseQR::Init(std::string* message) { - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - *message = "cuBLAS::cublasCreate failed."; +bool CUDADenseQR::Init(ContextImpl* context, std::string* message) { + if (!context->InitCUDA(message)) { return false; } - if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) { - *message = "cuSolverDN::cusolverDnCreate failed."; - return false; - } - if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) != - cudaSuccess) { - *message = "CUDA::cudaStreamCreateWithFlags failed."; - cusolverDnDestroy(cusolver_handle_); - cublasDestroy(cublas_handle_); - return false; - } - if (cusolverDnSetStream(cusolver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS) { - *message = "cuSolverDN::cusolverDnSetStream failed."; - cusolverDnDestroy(cusolver_handle_); - cudaStreamDestroy(stream_); - cublasDestroy(cublas_handle_); - return false; - } - if (cublasSetStream(cublas_handle_, stream_) != CUBLAS_STATUS_SUCCESS) { - *message = "cuBLAS::cublasSetStream failed."; - cusolverDnDestroy(cusolver_handle_); - cublasDestroy(cublas_handle_); - cudaStreamDestroy(stream_); - return false; - } + cublas_handle_ = context->cublas_handle_; + cusolver_handle_ = context->cusolver_handle_; + stream_ = context->stream_; error_.Reserve(1); *message = "CUDADenseQR::Init Success."; return true; } -CUDADenseQR::~CUDADenseQR() { - if (cublas_handle_ != nullptr) { - CHECK_EQ(cublasDestroy(cublas_handle_), CUBLAS_STATUS_SUCCESS); - } - if (cusolver_handle_ != nullptr) { - CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS); - } - if (stream_ != nullptr) { - CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess); - } -} - LinearSolverTerminationType CUDADenseQR::Factorize( int num_rows, int num_cols, double* lhs, std::string* message) { factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; @@ -496,7 +462,7 @@ auto cuda_dense_qr = std::unique_ptr<CUDADenseQR>(new CUDADenseQR()); std::string cuda_error; - if (cuda_dense_qr->Init(&cuda_error)) { + if (cuda_dense_qr->Init(options.context, &cuda_error)) { return cuda_dense_qr; } // Initialization failed, destroy the object (done automatically) and return a
diff --git a/internal/ceres/dense_qr.h b/internal/ceres/dense_qr.h index 8bcccf0..d42cf8c 100644 --- a/internal/ceres/dense_qr.h +++ b/internal/ceres/dense_qr.h
@@ -45,7 +45,9 @@ #include "ceres/internal/export.h" #include "ceres/linear_solver.h" #include "glog/logging.h" + #ifndef CERES_NO_CUDA +#include "ceres/context_impl.h" #include "ceres/cuda_buffer.h" #include "cuda_runtime.h" #include "cublas_v2.h" @@ -153,7 +155,6 @@ public: static std::unique_ptr<CUDADenseQR> Create( const LinearSolver::Options& options); - ~CUDADenseQR() override; CUDADenseQR(const CUDADenseQR&) = delete; CUDADenseQR& operator=(const CUDADenseQR&) = delete; LinearSolverTerminationType Factorize(int num_rows, @@ -166,11 +167,10 @@ private: CUDADenseQR(); - // Initializes the cuSolverDN context, creates an asynchronous stream, and - // associates the stream with cuSolverDN. Returns true iff initialization was - // successful, else it returns false and a human-readable error message is - // returned. - bool Init(std::string* message); + // Picks up the cuSolverDN, cuBLAS, and cuStream handles from the context. If + // the context is unable to initialize CUDA, returns false with a + // human-readable message indicating the reason. + bool Init(ContextImpl* context,std::string* message); // Handle to the cuSOLVER context. cusolverDnHandle_t cusolver_handle_ = nullptr;
diff --git a/internal/ceres/dense_qr_test.cc b/internal/ceres/dense_qr_test.cc index d9d307e..f796186 100644 --- a/internal/ceres/dense_qr_test.cc +++ b/internal/ceres/dense_qr_test.cc
@@ -67,6 +67,8 @@ using VectorType = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>; LinearSolver::Options options; + ContextImpl context; + options.context = &context; options.dense_linear_algebra_library_type = GetParam(); const double kEpsilon = std::numeric_limits<double>::epsilon() * 1.5e4; std::unique_ptr<DenseQR> dense_qr = DenseQR::Create(options);