Add support for dense CUDA solvers #1 1. Add CUDADenseCholesky64Bit, CUDADenseCholesky32Bit, & tests. CUDADenseCholesky32Bit uses the legacy versions of potrf/potrs in cuSolverDN, while CUDADenseCholesky64Bit uses the new 64-bit versions available since Cuda 11.1. The legacy versions are provided since some platforms such as the Nvidia Jetsons only support Cuda 10.2. 2. Expose CUDA as a new option under DenseLinearAlgebraLibraryType. The relevant option to string and string to option helper functions are modified accordingly. 3. Add cuda as a dense_linear_algebra_library option in bundle_adjuster to demonstrate the use of the new CUDA option. Change-Id: I23615e1d301df5185ed646b3e33ee802508dae86
diff --git a/CMakeLists.txt b/CMakeLists.txt index 68d99c9..5254b73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -154,6 +154,7 @@ option(ACCELERATESPARSE "Enable use of sparse solvers in Apple's Accelerate framework." ON) endif() +option(CUDA "Enable use of CUDA linear algebra solvers." ON) option(LAPACK "Enable use of LAPACK directly within Ceres." ON) # Template specializations for the Schur complement based solvers. If # compile time, binary size or compiler performance is an issue, you @@ -260,6 +261,31 @@ endif (EIGENSPARSE) endif (Eigen3_FOUND) +if (CUDA) + find_package(CUDA QUIET) + if (CUDA_FOUND) + message("-- Found CUDA version ${CUDA_VERSION}: " + "${CUDA_LIBRARIES};" + "${CUDA_cusolver_LIBRARY};" + "${CUDA_cusparse_LIBRARY}") + if (CUDA_VERSION VERSION_LESS 11.1) + message("-- CUDA version ${CUDA_VERSION} is older than 11.1, " + "using 32-bit API.") + add_definitions(-DCERES_CUDA_NO_64BIT_SOLVER_API) + else() + message("-- CUDA version ${CUDA_VERSION} is at least 11.1, " + "using 64-bit API.") + endif() + else (CUDA_FOUND) + message("-- Did not find CUDA library, disabling CUDA support.") + update_cache_variable(CUDA OFF) + list(APPEND CERES_COMPILE_OPTIONS CERES_NO_CUDA) + endif (CUDA_FOUND) +else (CUDA) + message("-- Building without CUDA.") + list(APPEND CERES_COMPILE_OPTIONS CERES_NO_CUDA) +endif (CUDA) + if (LAPACK) find_package(LAPACK QUIET) if (LAPACK_FOUND)
diff --git a/cmake/config.h.in b/cmake/config.h.in index 4a516f6..7022f67 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in
@@ -53,6 +53,9 @@ // If defined, Ceres was compiled without CXSparse. @CERES_NO_CXSPARSE@ +// If defined, Ceres was compiled without CUDA. +@CERES_NO_CUDA@ + // If defined, Ceres was compiled without Apple's Accelerate framework solvers. @CERES_NO_ACCELERATE_SPARSE@
diff --git a/examples/bundle_adjuster.cc b/examples/bundle_adjuster.cc index ef2dbc4..1940cf2 100644 --- a/examples/bundle_adjuster.cc +++ b/examples/bundle_adjuster.cc
@@ -93,7 +93,7 @@ DEFINE_string(sparse_linear_algebra_library, "suite_sparse", "Options are: suite_sparse and cx_sparse."); DEFINE_string(dense_linear_algebra_library, "eigen", - "Options are: eigen and lapack."); + "Options are: eigen, lapack, and cuda"); DEFINE_string(ordering, "automatic", "Options are: automatic, user."); DEFINE_bool(use_quaternions, false, "If true, uses quaternions to represent "
diff --git a/include/ceres/types.h b/include/ceres/types.h index 5ee6fdc..3a38805 100644 --- a/include/ceres/types.h +++ b/include/ceres/types.h
@@ -186,6 +186,7 @@ enum DenseLinearAlgebraLibraryType { EIGEN, LAPACK, + CUDA, }; // Logging options
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index 80703a1..7a8bd41 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -218,6 +218,14 @@ list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${AccelerateSparse_LIBRARIES}) endif() +if (CUDA_FOUND) + list(APPEND + CERES_LIBRARY_PRIVATE_DEPENDENCIES + ${CUDA_LIBRARIES} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY}) +endif (CUDA_FOUND) + if (LAPACK_FOUND) list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${LAPACK_LIBRARIES}) endif () @@ -362,6 +370,10 @@ list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS ${AccelerateSparse_INCLUDE_DIRS}) endif() +if (CUDA) + list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS + ${CUDA_INCLUDE_DIRS}) +endif() # Add include locations for optional dependencies to the Ceres target without # duplication. list(REMOVE_DUPLICATES CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS) @@ -453,6 +465,7 @@ ceres_test(cost_function_to_functor) ceres_test(covariance) ceres_test(cubic_interpolation) + ceres_test(cuda_dense_cholesky) ceres_test(dense_linear_solver) ceres_test(dense_cholesky) ceres_test(dense_qr)
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h new file mode 100644 index 0000000..2d983af --- /dev/null +++ b/internal/ceres/cuda_buffer.h
@@ -0,0 +1,98 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: joydeepb@cs.utexas.edu (Joydeep Biswas) + +#ifndef CERES_INTERNAL_CUDA_BUFFER_H_ +#define CERES_INTERNAL_CUDA_BUFFER_H_ + +#include "ceres/internal/config.h" + +#ifndef CERES_NO_CUDA + +#include <vector> + +#include "cuda_runtime.h" +#include "glog/logging.h" + +// An encapsulated buffer to maintain GPU memory, and handle transfers between +// GPU and system memory. It is the responsibility of the user to ensure that +// the appropriate GPU device is selected before each subroutine is called. This +// is particularly important when using multiple GPU devices on different CPU +// threads, since active Cuda devices are determined by the cuda runtime on a +// per-thread basis. +template <typename T> +class CudaBuffer { + public: + CudaBuffer() {} + CudaBuffer(const CudaBuffer&) = delete; + CudaBuffer& operator=(const CudaBuffer&) = delete; + + ~CudaBuffer() { + if (data_ != nullptr) { + CHECK_EQ(cudaFree(data_), cudaSuccess); + } + } + + void Reserve(const size_t size) { + if (size > size_) { + if (data_ != nullptr) { + CHECK_EQ(cudaFree(data_), cudaSuccess); + } + CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess); + size_ = size; + } + } + + void CopyToGpu(const T* data, const size_t size) { + Reserve(size); + CHECK_EQ(cudaMemcpy(data_, data, size * sizeof(T), cudaMemcpyHostToDevice), + cudaSuccess); + } + + void CopyToHost(T* data, const size_t size) { + CHECK(data_ != nullptr); + CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost), + cudaSuccess); + } + + void CopyToGpu(const std::vector<T>& data) { + CopyToGpu(data.data(), data.size()); + } + + T* data() { return data_; } + size_t size() const { return size_; } + + private: + T* data_ = nullptr; + size_t size_ = 0; +}; + +#endif // CERES_NO_CUDA + +#endif // CERES_INTERNAL_CUDA_BUFFER_H_ \ No newline at end of file
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc new file mode 100644 index 0000000..7d39b0e --- /dev/null +++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -0,0 +1,207 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: joydeepb@cs.utexas.edu (Joydeep Biswas) + +#include <string> + +#include "ceres/dense_cholesky.h" +#include "ceres/internal/eigen.h" + +#include "glog/logging.h" +#include "gtest/gtest.h" + +using std::string; + +namespace ceres { +namespace internal { + +#ifndef CERES_NO_CUDA + +TEST(DenseCholeskyCudaSolverOld, InvalidOptionOnCreate) { + LinearSolver::Options options; + auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options); + EXPECT_EQ(dense_cuda_solver, nullptr); +} + +// Tests the CUDA Cholesky solver with a simple 4x4 matrix. +TEST(DenseCholeskyCudaSolverOld, Cholesky4x4Matrix) { + Eigen::Matrix4d A; + A << 4, 12, -16, 0, + 12, 37, -43, 0, + -16, -43, 98, 0, + 0, 0, 0, 1; + const Eigen::Vector4d b = Eigen::Vector4d::Ones(); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + string error_string; + ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), + A.data(), + &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS); + Eigen::Vector4d x = Eigen::Vector4d::Zero(); + ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS); + EXPECT_NEAR(x(0), 113.75 / 3.0, std::numeric_limits<double>::epsilon() * 10); + EXPECT_NEAR(x(1), -31.0 / 3.0, std::numeric_limits<double>::epsilon() * 10); + EXPECT_NEAR(x(2), 5.0 / 3.0, std::numeric_limits<double>::epsilon() * 10); + EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10); +} + +TEST(DenseCholeskyCudaSolverOld, SingularMatrix) { + Eigen::Matrix3d A; + A << 1, 0, 0, + 0, 1, 0, + 0, 0, 0; + const Eigen::Vector3d b = Eigen::Vector3d::Ones(); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + string error_string; + ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), + A.data(), + &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_FAILURE); +} + +TEST(DenseCholeskyCudaSolverOld, NegativeMatrix) { + Eigen::Matrix3d A; + A << 1, 0, 0, + 0, 1, 0, + 0, 0, -1; + const Eigen::Vector3d b = Eigen::Vector3d::Ones(); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + string error_string; + ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), + A.data(), + &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_FAILURE); +} + +TEST(DenseCholeskyCudaSolverOld, MustFactorizeBeforeSolve) { + const Eigen::Vector3d b = Eigen::Vector3d::Ones(); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + string error_string; + ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR); +} + +#ifndef CERES_CUDA_NO_64BIT_SOLVER_API +// TODO(sameeragarwal): Convert these tests into templated tests so that we use +// the exact same code for both versions of the test. + +TEST(DenseCholeskyCudaSolverNew, InvalidOptionOnCreate) { + LinearSolver::Options options; + auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options); + EXPECT_EQ(dense_cuda_solver, nullptr); +} + +TEST(DenseCholeskyCudaSolverNew, Cholesky4x4Matrix) { + Eigen::Matrix4d A; + A << 4, 12, -16, 0, + 12, 37, -43, 0, + -16, -43, 98, 0, + 0, 0, 0, 1; + const Eigen::Vector4d b = Eigen::Vector4d::Ones(); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + string error_string; + ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), + A.data(), + &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS); + Eigen::Vector4d x = Eigen::Vector4d::Zero(); + ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS); + EXPECT_NEAR(x(0), 113.75 / 3.0, std::numeric_limits<double>::epsilon() * 10); + EXPECT_NEAR(x(1), -31.0 / 3.0, std::numeric_limits<double>::epsilon() * 10); + EXPECT_NEAR(x(2), 5.0 / 3.0, std::numeric_limits<double>::epsilon() * 10); + EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10); +} + +TEST(DenseCholeskyCudaSolverNew, SingularMatrix) { + Eigen::Matrix3d A; + A << 1, 0, 0, + 0, 1, 0, + 0, 0, 0; + const Eigen::Vector3d b = Eigen::Vector3d::Ones(); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + string error_string; + ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), + A.data(), + &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_FAILURE); +} + +TEST(DenseCholeskyCudaSolverNew, NegativeMatrix) { + Eigen::Matrix3d A; + A << 1, 0, 0, + 0, 1, 0, + 0, 0, -1; + const Eigen::Vector3d b = Eigen::Vector3d::Ones(); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + string error_string; + ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), + A.data(), + &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_FAILURE); +} + +TEST(DenseCholeskyCudaSolverNew, MustFactorizeBeforeSolve) { + const Eigen::Vector3d b = Eigen::Vector3d::Ones(); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + string error_string; + ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR); +} +#endif // CERES_CUDA_NO_64BIT_SOLVER_API + +#endif // CERES_NO_CUDA + +} // namespace internal +} // namespace ceres
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc index f83fdf1..6070ea0 100644 --- a/internal/ceres/dense_cholesky.cc +++ b/internal/ceres/dense_cholesky.cc
@@ -33,6 +33,12 @@ #include <algorithm> #include <memory> #include <string> +#include <vector> + +#ifndef CERES_NO_CUDA +#include "cuda_runtime.h" +#include "cusolverDn.h" +#endif // CERES_NO_CUDA #ifndef CERES_NO_LAPACK @@ -70,6 +76,14 @@ LOG(FATAL) << "Ceres was compiled without support for LAPACK."; #endif + case CUDA: +#ifndef CERES_NO_CUDA + dense_cholesky = CUDADenseCholesky::Create(options); + break; +#else + LOG(FATAL) << "Ceres was compiled without support for CUDA."; +#endif + default: LOG(FATAL) << "Unknown dense linear algebra library type : " << DenseLinearAlgebraLibraryTypeToString( @@ -175,5 +189,362 @@ #endif // CERES_NO_LAPACK +#ifndef CERES_NO_CUDA + +bool CUDADenseCholesky32Bit::Init(std::string* message) { + if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnCreate failed."; + return false; + } + if (cudaStreamCreate(&stream_) != cudaSuccess) { + *message = "cuSolverDN::cudaStreamCreate failed."; + cusolverDnDestroy(cusolver_handle_); + return false; + } + if (cusolverDnSetStream(cusolver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnSetStream failed."; + cudaStreamDestroy(stream_); + cusolverDnDestroy(cusolver_handle_); + return false; + } + error_.Reserve(1); + *message = "CUDADenseCholesky64Bit::Init Success."; + return true; +} + +CUDADenseCholesky32Bit::~CUDADenseCholesky32Bit() { + if (cusolver_handle_ != nullptr) { + CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS); + CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess); + } +} + +LinearSolverTerminationType CUDADenseCholesky32Bit::Factorize( + int num_cols, + double* lhs, + std::string* message) { + factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + // Allocate GPU memory if necessary. + lhs_.Reserve(num_cols * num_cols); + num_cols_ = num_cols; + // Copy A to GPU. + lhs_.CopyToGpu(lhs, num_cols * num_cols); + // Allocate scratch space on GPU. + int device_workspace_size = 0; + if (cusolverDnDpotrf_bufferSize(cusolver_handle_, + CUBLAS_FILL_MODE_LOWER, + num_cols, + lhs_.data(), + num_cols, + &device_workspace_size) != + CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnDpotrf_bufferSize failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + // Allocate GPU scratch memory. + device_workspace_.Reserve(device_workspace_size); + // Run the actual factorization (potrf) + if (cusolverDnDpotrf(cusolver_handle_, + CUBLAS_FILL_MODE_LOWER, + num_cols, + lhs_.data(), + num_cols, + reinterpret_cast<double*>(device_workspace_.data()), + device_workspace_.size(), + error_.data()) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnDpotrf failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + if (cudaDeviceSynchronize() != cudaSuccess || + cudaStreamSynchronize(stream_), cudaSuccess) { + *message = "Cuda device synchronization failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + int error = 0; + // Check for errors. + error_.CopyToHost(&error, 1); + if (error < 0) { + LOG(FATAL) << "Congratulations, you found a bug in Ceres - " + << "please report it. " + << "cuSolverDN::cusolverDnXpotrf fatal error." + << "Argument: " << -error << " is invalid."; + // The following line is unreachable, but return failure just to be + // pedantic, since the compiler does not know that. + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } else if (error > 0) { + *message = StringPrintf( + "cuSolverDN::cusolverDnDpotrf numerical failure. " + "The leading minor of order %d is not positive definite.", + error); + factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FAILURE; + return LinearSolverTerminationType::LINEAR_SOLVER_FAILURE; + } + *message = "Success"; + factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; + return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; +} + +LinearSolverTerminationType CUDADenseCholesky32Bit::Solve( + const double* rhs, + double* solution, + std::string* message) { + if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) { + *message = "Factorize did not complete succesfully previously."; + return factorize_result_; + } + // Copy RHS to GPU. + rhs_.CopyToGpu(rhs, num_cols_); + // Solve the system. + if (cusolverDnDpotrs(cusolver_handle_, + CUBLAS_FILL_MODE_LOWER, + num_cols_, + 1, + lhs_.data(), + num_cols_, + rhs_.data(), + num_cols_, + error_.data()) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnDpotrs failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + if (cudaDeviceSynchronize() != cudaSuccess || + cudaStreamSynchronize(stream_), cudaSuccess) { + *message = "Cuda device synchronization failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + // Check for errors. + int error = 0; + // Copy X from GPU to host. + rhs_.CopyToHost(solution, num_cols_); + if (error != 0) { + LOG(FATAL) << "Congratulations, you found a bug in Ceres." + << "Please report it." + << "cuSolverDN::cusolverDnDpotrs fatal error." + << "Argument: " << -error << " is invalid."; + } + // Copy error variable from GPU to host. + error_.CopyToHost(&error, 1); + *message = "Success"; + return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; +} + +std::unique_ptr<CUDADenseCholesky32Bit> CUDADenseCholesky32Bit::Create( + const LinearSolver::Options& options) { + if (options.dense_linear_algebra_library_type != CUDA) { + // The user called the wrong factory method. + return nullptr; + } + CUDADenseCholesky32Bit* cuda_dense_cholesky = new CUDADenseCholesky32Bit(); + std::string cuda_error; + if (cuda_dense_cholesky->Init(&cuda_error)) { + return std::unique_ptr<CUDADenseCholesky32Bit>(cuda_dense_cholesky); + } + // Initialization failed, destroy the object and return a nullptr. + delete cuda_dense_cholesky; + LOG(ERROR) << "CUDADenseCholesky32Bit::Init failed: " << cuda_error; + return std::unique_ptr<CUDADenseCholesky32Bit>(nullptr); +} + +std::unique_ptr<CUDADenseCholesky64Bit> CUDADenseCholesky64Bit::Create( + const LinearSolver::Options& options) { + if (options.dense_linear_algebra_library_type != CUDA) { + // The user called the wrong factory method. + return nullptr; + } + CUDADenseCholesky64Bit* cuda_dense_cholesky = new CUDADenseCholesky64Bit(); + std::string cuda_error; + if (cuda_dense_cholesky->Init(&cuda_error)) { + return std::unique_ptr<CUDADenseCholesky64Bit>(cuda_dense_cholesky); + } + // Initialization failed, destroy the object and return a nullptr. + delete cuda_dense_cholesky; + LOG(ERROR) << "CUDADenseCholesky64Bit::Init failed: " << cuda_error; + return std::unique_ptr<CUDADenseCholesky64Bit>(nullptr); +} + +#ifdef CERES_CUDA_NO_64BIT_SOLVER_API +// CUDA < 11.1 did not have the 64-bit APIs, so the implementation of the new +// interface will just generate a fatal error. + +CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() {} + +bool CUDADenseCholesky64Bit::Init(std::string* message) { + *message = "Cannot use CUDADenseCholesky64Bit with CUDA < 11.1."; + return false; +} + +LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize( + int, + double*, + std::string*) { + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; +} + +LinearSolverTerminationType CUDADenseCholesky64Bit::Solve( + const double*, + double*, + std::string*) { + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; +} + +#else // CERES_CUDA_NO_64BIT_SOLVER_API + +bool CUDADenseCholesky64Bit::Init(std::string* message) { + if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnCreate failed."; + return false; + } + if (cudaStreamCreate(&stream_) != cudaSuccess) { + *message = "cuSolverDN::cudaStreamCreate failed."; + cusolverDnDestroy(cusolver_handle_); + return false; + } + if (cusolverDnSetStream(cusolver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnSetStream failed."; + cudaStreamDestroy(stream_); + cusolverDnDestroy(cusolver_handle_); + return false; + } + error_.Reserve(1); + *message = "CUDADenseCholesky64Bit::Init Success."; + return true; +} + +CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() { + if (cusolver_handle_ != nullptr) { + CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS); + CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess); + } +} + +LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize( + int num_cols, + double* lhs, + std::string* message) { + factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + // Allocate GPU memory if necessary. + lhs_.Reserve(num_cols * num_cols); + num_cols_ = num_cols; + // Copy A to GPU. + lhs_.CopyToGpu(lhs, num_cols * num_cols); + // Allocate scratch space on GPU. + size_t host_workspace_size = 0; + size_t device_workspace_size = 0; + if (cusolverDnXpotrf_bufferSize(cusolver_handle_, + nullptr, + CUBLAS_FILL_MODE_LOWER, + num_cols, + CUDA_R_64F, + lhs_.data(), + num_cols, + CUDA_R_64F, + &device_workspace_size, + &host_workspace_size) != + CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnXpotrf_bufferSize failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + // Allocate host scratch memory. + host_workspace_.resize(host_workspace_size); + // Allocate GPU scratch memory. + device_workspace_.Reserve(device_workspace_size); + // Run the actual factorization (potrf) + if (cusolverDnXpotrf(cusolver_handle_, + nullptr, + CUBLAS_FILL_MODE_LOWER, + num_cols, + CUDA_R_64F, + lhs_.data(), + num_cols, + CUDA_R_64F, + device_workspace_.data(), + device_workspace_.size(), + host_workspace_.data(), + host_workspace_.size(), + error_.data()) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnXpotrf failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + if (cudaDeviceSynchronize() != cudaSuccess || + cudaStreamSynchronize(stream_) != cudaSuccess) { + *message = "Cuda device synchronization failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + int error = 0; + // Check for errors. + error_.CopyToHost(&error, 1); + if (error < 0) { + LOG(FATAL) << "Congratulations, you found a bug in Ceres - " + << "please report it. " + << "cuSolverDN::cusolverDnXpotrf fatal error." + << "Argument: " << -error << " is invalid."; + // The following line is unreachable, but return failure just to be + // pedantic, since the compiler does not know that. + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } else if (error > 0) { + *message = StringPrintf( + "cuSolverDN::cusolverDnXpotrf numerical failure. " + "The leading minor of order %d is not positive definite.", + error); + factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FAILURE; + return LinearSolverTerminationType::LINEAR_SOLVER_FAILURE; + } + + *message = "Success"; + factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; + return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; +} + +LinearSolverTerminationType CUDADenseCholesky64Bit::Solve( + const double* rhs, + double* solution, + std::string* message) { + if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) { + *message = "Factorize did not complete succesfully previously."; + return factorize_result_; + } + // Copy RHS to GPU. + rhs_.CopyToGpu(rhs, num_cols_); + // Solve the system. + if (cusolverDnXpotrs(cusolver_handle_, + nullptr, + CUBLAS_FILL_MODE_LOWER, + num_cols_, + 1, + CUDA_R_64F, + lhs_.data(), + num_cols_, + CUDA_R_64F, + rhs_.data(), + num_cols_, + error_.data()) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnXpotrs failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + if (cudaDeviceSynchronize() != cudaSuccess || + cudaStreamSynchronize(stream_) != cudaSuccess) { + *message = "Cuda device synchronization failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + // Check for errors. + int error = 0; + // Copy error variable from GPU to host. + error_.CopyToHost(&error, 1); + if (error != 0) { + LOG(FATAL) << "Congratulations, you found a bug in Ceres." + << "Please report it." + << "cuSolverDN::cusolverDnXpotrs fatal error." + << "Argument: " << -error << " is invalid."; + } + // Copy X from GPU to host. + rhs_.CopyToHost(solution, num_cols_); + *message = "Success"; + return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; +} + +#endif // CERES_CUDA_NO_64BIT_SOLVER_API + +#endif // CERES_NO_CUDA + } // namespace internal } // namespace ceres
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h index 268862d..470998a 100644 --- a/internal/ceres/dense_cholesky.h +++ b/internal/ceres/dense_cholesky.h
@@ -37,10 +37,16 @@ // clang-format on #include <memory> +#include <vector> #include "Eigen/Dense" +#include "ceres/cuda_buffer.h" #include "ceres/linear_solver.h" #include "glog/logging.h" +#ifndef CERES_NO_CUDA +#include "cuda_runtime.h" +#include "cusolverDn.h" +#endif // CERES_NO_CUDA namespace ceres { namespace internal { @@ -129,6 +135,107 @@ }; #endif // CERES_NO_LAPACK +#ifndef CERES_NO_CUDA +// Implementation of DenseCholesky using the cuSolver library v.11.0 or older, +// using the legacy cuSolverDn interface. +class CERES_EXPORT_INTERNAL CUDADenseCholesky32Bit : public DenseCholesky { + public: + static std::unique_ptr<CUDADenseCholesky32Bit> Create( + const LinearSolver::Options& options); + ~CUDADenseCholesky32Bit() override; + CUDADenseCholesky32Bit(const CUDADenseCholesky32Bit&) = delete; + CUDADenseCholesky32Bit& operator=(const CUDADenseCholesky32Bit&) = delete; + LinearSolverTerminationType Factorize(int num_cols, + double* lhs, + std::string* message) override; + LinearSolverTerminationType Solve(const double* rhs, + double* solution, + std::string* message) override; + + private: + CUDADenseCholesky32Bit() {} + // Initializes the cuSolverDN context, creates an asynchronous stream, and + // associates the stream with cuSolverDN. Returns true iff initialization was + // successful, else it returns false and a human-readable error message is + // returned. + bool Init(std::string* message); + + // Handle to the cuSOLVER context. + cusolverDnHandle_t cusolver_handle_ = nullptr; + // CUDA device stream. + cudaStream_t stream_ = nullptr; + // Number of columns in the A matrix, to be cached between calls to *Factorize + // and *Solve. + size_t num_cols_ = 0; + // GPU memory allocated for the A matrix (lhs matrix). + CudaBuffer<double> lhs_; + // GPU memory allocated for the B matrix (rhs vector). + CudaBuffer<double> rhs_; + // Scratch space for cuSOLVER on the GPU. + CudaBuffer<uint8_t> device_workspace_; + // Required for error handling with cuSOLVER. + CudaBuffer<int> error_; + // Cache the result of Factorize to ensure that when Solve is called, the + // factiorization of lhs is valid. + LinearSolverTerminationType factorize_result_ = + LINEAR_SOLVER_FATAL_ERROR; +}; + +// Implementation of DenseCholesky using the cuSolver library v.11.1 or newer, +// using the 64-bit cuSolverDn interface. +class CERES_EXPORT_INTERNAL CUDADenseCholesky64Bit : public DenseCholesky { + public: + static std::unique_ptr<CUDADenseCholesky64Bit> Create( + const LinearSolver::Options& options); + ~CUDADenseCholesky64Bit() override; + CUDADenseCholesky64Bit(const CUDADenseCholesky64Bit&) = delete; + CUDADenseCholesky64Bit& operator=(const CUDADenseCholesky64Bit&) = delete; + LinearSolverTerminationType Factorize(int num_cols, + double* lhs, + std::string* message) override; + LinearSolverTerminationType Solve(const double* rhs, + double* solution, + std::string* message) override; + + private: + CUDADenseCholesky64Bit() {} + // Initializes the cuSolverDN context, creates an asynchronous stream, and + // associates the stream with cuSolverDN. Returns true iff initialization was + // successful, else it returns false and a human-readable error message is + // returned. + bool Init(std::string* message); + + // Handle to the cuSOLVER context. + cusolverDnHandle_t cusolver_handle_ = nullptr; + // CUDA device stream. + cudaStream_t stream_ = nullptr; + // Number of columns in the A matrix, to be cached between calls to Factorize + // and Solve. + size_t num_cols_ = 0; + // GPU memory allocated for the A matrix (lhs matrix). + CudaBuffer<double> lhs_; + // GPU memory allocated for the B matrix (rhs vector). + CudaBuffer<double> rhs_; + // Workspace for cuSOLVER on the GPU. + CudaBuffer<uint8_t> device_workspace_; + // Workspace for cuSOLVER on the host. + std::vector<double> host_workspace_; + // Required for error handling with cuSOLVER. + CudaBuffer<int> error_; + // Cache the result of Factorize to ensure that when Solve is called, the + // factiorization of lhs is valid. + LinearSolverTerminationType factorize_result_ = + LINEAR_SOLVER_FATAL_ERROR; +}; + +#ifdef CERES_CUDA_NO_64BIT_SOLVER_API +using CUDADenseCholesky = CUDADenseCholesky32Bit; +#else +using CUDADenseCholesky = CUDADenseCholesky64Bit; +#endif + +#endif // CERES_NO_CUDA + } // namespace internal } // namespace ceres
diff --git a/internal/ceres/dense_cholesky_test.cc b/internal/ceres/dense_cholesky_test.cc index eda4f10..4141e79 100644 --- a/internal/ceres/dense_cholesky_test.cc +++ b/internal/ceres/dense_cholesky_test.cc
@@ -92,17 +92,20 @@ } } +INSTANTIATE_TEST_SUITE_P( + _, + DenseCholeskyTest, + ::testing::Values( + EIGEN, #ifndef CERES_NO_LAPACK -INSTANTIATE_TEST_SUITE_P(_, - DenseCholeskyTest, - ::testing::Values(EIGEN, LAPACK), - ParamInfoToString); -#else -INSTANTIATE_TEST_SUITE_P(_, - DenseCholeskyTest, - ::testing::Values(EIGEN), - ParamInfoToString); + LAPACK, #endif +#ifndef CERES_NO_CUDA + CUDA +#endif + ), + ParamInfoToString); + } // namespace internal } // namespace ceres
diff --git a/internal/ceres/dense_linear_solver_benchmark.cc b/internal/ceres/dense_linear_solver_benchmark.cc index b8f1e00..2af9fa8 100644 --- a/internal/ceres/dense_linear_solver_benchmark.cc +++ b/internal/ceres/dense_linear_solver_benchmark.cc
@@ -95,7 +95,12 @@ ->Apply(MatrixSizes); BENCHMARK_TEMPLATE2(BM_DenseSolver, ceres::LAPACK, ceres::DENSE_NORMAL_CHOLESKY) ->Apply(MatrixSizes); -#endif +#endif // CERES_NO_LAPACK + +#ifndef CERES_NO_CUDA +BENCHMARK(BM_DenseSolver<ceres::CUDA, ceres::DENSE_NORMAL_CHOLESKY>) + ->Apply(MatrixSizes); +#endif // CERES_NO_CUDA } // namespace internal } // namespace ceres
diff --git a/internal/ceres/solver_utils.cc b/internal/ceres/solver_utils.cc index eb5aafa..1622e0f 100644 --- a/internal/ceres/solver_utils.cc +++ b/internal/ceres/solver_utils.cc
@@ -36,6 +36,9 @@ #include "ceres/internal/config.h" #include "ceres/internal/port.h" #include "ceres/version.h" +#ifndef CERES_NO_CUDA +#include "cuda_runtime.h" +#endif // CERES_NO_CUDA namespace ceres { namespace internal { @@ -87,6 +90,10 @@ value += "-no_custom_blas"; #endif +#ifndef CERES_NO_CUDA + value += "-cuda-(" + std::to_string(CUDART_VERSION) + ")"; +#endif + return value; }
diff --git a/internal/ceres/types.cc b/internal/ceres/types.cc index 39bb2d8..bf99568 100644 --- a/internal/ceres/types.cc +++ b/internal/ceres/types.cc
@@ -128,6 +128,7 @@ switch (type) { CASESTR(EIGEN); CASESTR(LAPACK); + CASESTR(CUDA); default: return "UNKNOWN"; } @@ -138,6 +139,7 @@ UpperCase(&value); STRENUM(EIGEN); STRENUM(LAPACK); + STRENUM(CUDA); return false; }