Add support for dense CUDA solvers #1
1. Add CUDADenseCholesky64Bit, CUDADenseCholesky32Bit, & tests.
CUDADenseCholesky32Bit uses the legacy versions of potrf/potrs
in cuSolverDN, while CUDADenseCholesky64Bit uses the new 64-bit
versions available since Cuda 11.1. The legacy versions are
provided since some platforms such as the Nvidia Jetsons only
support Cuda 10.2.
2. Expose CUDA as a new option under DenseLinearAlgebraLibraryType.
The relevant option to string and string to option helper functions
are modified accordingly.
3. Add cuda as a dense_linear_algebra_library option in bundle_adjuster
to demonstrate the use of the new CUDA option.
Change-Id: I23615e1d301df5185ed646b3e33ee802508dae86
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 68d99c9..5254b73 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -154,6 +154,7 @@
option(ACCELERATESPARSE
"Enable use of sparse solvers in Apple's Accelerate framework." ON)
endif()
+option(CUDA "Enable use of CUDA linear algebra solvers." ON)
option(LAPACK "Enable use of LAPACK directly within Ceres." ON)
# Template specializations for the Schur complement based solvers. If
# compile time, binary size or compiler performance is an issue, you
@@ -260,6 +261,31 @@
endif (EIGENSPARSE)
endif (Eigen3_FOUND)
+if (CUDA)
+ find_package(CUDA QUIET)
+ if (CUDA_FOUND)
+ message("-- Found CUDA version ${CUDA_VERSION}: "
+ "${CUDA_LIBRARIES};"
+ "${CUDA_cusolver_LIBRARY};"
+ "${CUDA_cusparse_LIBRARY}")
+ if (CUDA_VERSION VERSION_LESS 11.1)
+ message("-- CUDA version ${CUDA_VERSION} is older than 11.1, "
+ "using 32-bit API.")
+ add_definitions(-DCERES_CUDA_NO_64BIT_SOLVER_API)
+ else()
+ message("-- CUDA version ${CUDA_VERSION} is at least 11.1, "
+ "using 64-bit API.")
+ endif()
+ else (CUDA_FOUND)
+ message("-- Did not find CUDA library, disabling CUDA support.")
+ update_cache_variable(CUDA OFF)
+ list(APPEND CERES_COMPILE_OPTIONS CERES_NO_CUDA)
+ endif (CUDA_FOUND)
+else (CUDA)
+ message("-- Building without CUDA.")
+ list(APPEND CERES_COMPILE_OPTIONS CERES_NO_CUDA)
+endif (CUDA)
+
if (LAPACK)
find_package(LAPACK QUIET)
if (LAPACK_FOUND)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 4a516f6..7022f67 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -53,6 +53,9 @@
// If defined, Ceres was compiled without CXSparse.
@CERES_NO_CXSPARSE@
+// If defined, Ceres was compiled without CUDA.
+@CERES_NO_CUDA@
+
// If defined, Ceres was compiled without Apple's Accelerate framework solvers.
@CERES_NO_ACCELERATE_SPARSE@
diff --git a/examples/bundle_adjuster.cc b/examples/bundle_adjuster.cc
index ef2dbc4..1940cf2 100644
--- a/examples/bundle_adjuster.cc
+++ b/examples/bundle_adjuster.cc
@@ -93,7 +93,7 @@
DEFINE_string(sparse_linear_algebra_library, "suite_sparse",
"Options are: suite_sparse and cx_sparse.");
DEFINE_string(dense_linear_algebra_library, "eigen",
- "Options are: eigen and lapack.");
+ "Options are: eigen, lapack, and cuda");
DEFINE_string(ordering, "automatic", "Options are: automatic, user.");
DEFINE_bool(use_quaternions, false, "If true, uses quaternions to represent "
diff --git a/include/ceres/types.h b/include/ceres/types.h
index 5ee6fdc..3a38805 100644
--- a/include/ceres/types.h
+++ b/include/ceres/types.h
@@ -186,6 +186,7 @@
enum DenseLinearAlgebraLibraryType {
EIGEN,
LAPACK,
+ CUDA,
};
// Logging options
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 80703a1..7a8bd41 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -218,6 +218,14 @@
list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${AccelerateSparse_LIBRARIES})
endif()
+if (CUDA_FOUND)
+ list(APPEND
+ CERES_LIBRARY_PRIVATE_DEPENDENCIES
+ ${CUDA_LIBRARIES}
+ ${CUDA_cusolver_LIBRARY}
+ ${CUDA_cusparse_LIBRARY})
+endif (CUDA_FOUND)
+
if (LAPACK_FOUND)
list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${LAPACK_LIBRARIES})
endif ()
@@ -362,6 +370,10 @@
list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS
${AccelerateSparse_INCLUDE_DIRS})
endif()
+if (CUDA)
+ list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS
+ ${CUDA_INCLUDE_DIRS})
+endif()
# Add include locations for optional dependencies to the Ceres target without
# duplication.
list(REMOVE_DUPLICATES CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS)
@@ -453,6 +465,7 @@
ceres_test(cost_function_to_functor)
ceres_test(covariance)
ceres_test(cubic_interpolation)
+ ceres_test(cuda_dense_cholesky)
ceres_test(dense_linear_solver)
ceres_test(dense_cholesky)
ceres_test(dense_qr)
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
new file mode 100644
index 0000000..2d983af
--- /dev/null
+++ b/internal/ceres/cuda_buffer.h
@@ -0,0 +1,98 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#ifndef CERES_INTERNAL_CUDA_BUFFER_H_
+#define CERES_INTERNAL_CUDA_BUFFER_H_
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include <vector>
+
+#include "cuda_runtime.h"
+#include "glog/logging.h"
+
+// An encapsulated buffer to maintain GPU memory, and handle transfers between
+// GPU and system memory. It is the responsibility of the user to ensure that
+// the appropriate GPU device is selected before each subroutine is called. This
+// is particularly important when using multiple GPU devices on different CPU
+// threads, since active Cuda devices are determined by the cuda runtime on a
+// per-thread basis.
+template <typename T>
+class CudaBuffer {
+ public:
+ CudaBuffer() {}
+ CudaBuffer(const CudaBuffer&) = delete;
+ CudaBuffer& operator=(const CudaBuffer&) = delete;
+
+ ~CudaBuffer() {
+ if (data_ != nullptr) {
+ CHECK_EQ(cudaFree(data_), cudaSuccess);
+ }
+ }
+
+ void Reserve(const size_t size) {
+ if (size > size_) {
+ if (data_ != nullptr) {
+ CHECK_EQ(cudaFree(data_), cudaSuccess);
+ }
+ CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess);
+ size_ = size;
+ }
+ }
+
+ void CopyToGpu(const T* data, const size_t size) {
+ Reserve(size);
+ CHECK_EQ(cudaMemcpy(data_, data, size * sizeof(T), cudaMemcpyHostToDevice),
+ cudaSuccess);
+ }
+
+ void CopyToHost(T* data, const size_t size) {
+ CHECK(data_ != nullptr);
+ CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
+ cudaSuccess);
+ }
+
+ void CopyToGpu(const std::vector<T>& data) {
+ CopyToGpu(data.data(), data.size());
+ }
+
+ T* data() { return data_; }
+ size_t size() const { return size_; }
+
+ private:
+ T* data_ = nullptr;
+ size_t size_ = 0;
+};
+
+#endif // CERES_NO_CUDA
+
+#endif // CERES_INTERNAL_CUDA_BUFFER_H_
\ No newline at end of file
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
new file mode 100644
index 0000000..7d39b0e
--- /dev/null
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -0,0 +1,207 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <string>
+
+#include "ceres/dense_cholesky.h"
+#include "ceres/internal/eigen.h"
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+using std::string;
+
+namespace ceres {
+namespace internal {
+
+#ifndef CERES_NO_CUDA
+
+TEST(DenseCholeskyCudaSolverOld, InvalidOptionOnCreate) {
+ LinearSolver::Options options;
+ auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+ EXPECT_EQ(dense_cuda_solver, nullptr);
+}
+
+// Tests the CUDA Cholesky solver with a simple 4x4 matrix.
+TEST(DenseCholeskyCudaSolverOld, Cholesky4x4Matrix) {
+ Eigen::Matrix4d A;
+ A << 4, 12, -16, 0,
+ 12, 37, -43, 0,
+ -16, -43, 98, 0,
+ 0, 0, 0, 1;
+ const Eigen::Vector4d b = Eigen::Vector4d::Ones();
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ string error_string;
+ ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+ A.data(),
+ &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+ Eigen::Vector4d x = Eigen::Vector4d::Zero();
+ ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+ EXPECT_NEAR(x(0), 113.75 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+ EXPECT_NEAR(x(1), -31.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+ EXPECT_NEAR(x(2), 5.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+ EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10);
+}
+
+TEST(DenseCholeskyCudaSolverOld, SingularMatrix) {
+ Eigen::Matrix3d A;
+ A << 1, 0, 0,
+ 0, 1, 0,
+ 0, 0, 0;
+ const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ string error_string;
+ ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+ A.data(),
+ &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
+}
+
+TEST(DenseCholeskyCudaSolverOld, NegativeMatrix) {
+ Eigen::Matrix3d A;
+ A << 1, 0, 0,
+ 0, 1, 0,
+ 0, 0, -1;
+ const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ string error_string;
+ ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+ A.data(),
+ &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
+}
+
+TEST(DenseCholeskyCudaSolverOld, MustFactorizeBeforeSolve) {
+ const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ string error_string;
+ ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
+}
+
+#ifndef CERES_CUDA_NO_64BIT_SOLVER_API
+// TODO(sameeragarwal): Convert these tests into templated tests so that we use
+// the exact same code for both versions of the test.
+
+TEST(DenseCholeskyCudaSolverNew, InvalidOptionOnCreate) {
+ LinearSolver::Options options;
+ auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
+ EXPECT_EQ(dense_cuda_solver, nullptr);
+}
+
+TEST(DenseCholeskyCudaSolverNew, Cholesky4x4Matrix) {
+ Eigen::Matrix4d A;
+ A << 4, 12, -16, 0,
+ 12, 37, -43, 0,
+ -16, -43, 98, 0,
+ 0, 0, 0, 1;
+ const Eigen::Vector4d b = Eigen::Vector4d::Ones();
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ string error_string;
+ ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+ A.data(),
+ &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+ Eigen::Vector4d x = Eigen::Vector4d::Zero();
+ ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+ EXPECT_NEAR(x(0), 113.75 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+ EXPECT_NEAR(x(1), -31.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+ EXPECT_NEAR(x(2), 5.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
+ EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10);
+}
+
+TEST(DenseCholeskyCudaSolverNew, SingularMatrix) {
+ Eigen::Matrix3d A;
+ A << 1, 0, 0,
+ 0, 1, 0,
+ 0, 0, 0;
+ const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ string error_string;
+ ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+ A.data(),
+ &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
+}
+
+TEST(DenseCholeskyCudaSolverNew, NegativeMatrix) {
+ Eigen::Matrix3d A;
+ A << 1, 0, 0,
+ 0, 1, 0,
+ 0, 0, -1;
+ const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ string error_string;
+ ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
+ A.data(),
+ &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
+}
+
+TEST(DenseCholeskyCudaSolverNew, MustFactorizeBeforeSolve) {
+ const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ string error_string;
+ ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
+}
+#endif // CERES_CUDA_NO_64BIT_SOLVER_API
+
+#endif // CERES_NO_CUDA
+
+} // namespace internal
+} // namespace ceres
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index f83fdf1..6070ea0 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -33,6 +33,12 @@
#include <algorithm>
#include <memory>
#include <string>
+#include <vector>
+
+#ifndef CERES_NO_CUDA
+#include "cuda_runtime.h"
+#include "cusolverDn.h"
+#endif // CERES_NO_CUDA
#ifndef CERES_NO_LAPACK
@@ -70,6 +76,14 @@
LOG(FATAL) << "Ceres was compiled without support for LAPACK.";
#endif
+ case CUDA:
+#ifndef CERES_NO_CUDA
+ dense_cholesky = CUDADenseCholesky::Create(options);
+ break;
+#else
+ LOG(FATAL) << "Ceres was compiled without support for CUDA.";
+#endif
+
default:
LOG(FATAL) << "Unknown dense linear algebra library type : "
<< DenseLinearAlgebraLibraryTypeToString(
@@ -175,5 +189,362 @@
#endif // CERES_NO_LAPACK
+#ifndef CERES_NO_CUDA
+
+bool CUDADenseCholesky32Bit::Init(std::string* message) {
+ if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnCreate failed.";
+ return false;
+ }
+ if (cudaStreamCreate(&stream_) != cudaSuccess) {
+ *message = "cuSolverDN::cudaStreamCreate failed.";
+ cusolverDnDestroy(cusolver_handle_);
+ return false;
+ }
+ if (cusolverDnSetStream(cusolver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnSetStream failed.";
+ cudaStreamDestroy(stream_);
+ cusolverDnDestroy(cusolver_handle_);
+ return false;
+ }
+ error_.Reserve(1);
+ *message = "CUDADenseCholesky64Bit::Init Success.";
+ return true;
+}
+
+CUDADenseCholesky32Bit::~CUDADenseCholesky32Bit() {
+ if (cusolver_handle_ != nullptr) {
+ CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS);
+ CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess);
+ }
+}
+
+LinearSolverTerminationType CUDADenseCholesky32Bit::Factorize(
+ int num_cols,
+ double* lhs,
+ std::string* message) {
+ factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ // Allocate GPU memory if necessary.
+ lhs_.Reserve(num_cols * num_cols);
+ num_cols_ = num_cols;
+ // Copy A to GPU.
+ lhs_.CopyToGpu(lhs, num_cols * num_cols);
+ // Allocate scratch space on GPU.
+ int device_workspace_size = 0;
+ if (cusolverDnDpotrf_bufferSize(cusolver_handle_,
+ CUBLAS_FILL_MODE_LOWER,
+ num_cols,
+ lhs_.data(),
+ num_cols,
+ &device_workspace_size) !=
+ CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnDpotrf_bufferSize failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ // Allocate GPU scratch memory.
+ device_workspace_.Reserve(device_workspace_size);
+ // Run the actual factorization (potrf)
+ if (cusolverDnDpotrf(cusolver_handle_,
+ CUBLAS_FILL_MODE_LOWER,
+ num_cols,
+ lhs_.data(),
+ num_cols,
+ reinterpret_cast<double*>(device_workspace_.data()),
+ device_workspace_.size(),
+ error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnDpotrf failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ if (cudaDeviceSynchronize() != cudaSuccess ||
+ cudaStreamSynchronize(stream_), cudaSuccess) {
+ *message = "Cuda device synchronization failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ int error = 0;
+ // Check for errors.
+ error_.CopyToHost(&error, 1);
+ if (error < 0) {
+ LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
+ << "please report it. "
+ << "cuSolverDN::cusolverDnXpotrf fatal error."
+ << "Argument: " << -error << " is invalid.";
+ // The following line is unreachable, but return failure just to be
+ // pedantic, since the compiler does not know that.
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ } else if (error > 0) {
+ *message = StringPrintf(
+ "cuSolverDN::cusolverDnDpotrf numerical failure. "
+ "The leading minor of order %d is not positive definite.",
+ error);
+ factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
+ return LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
+ }
+ *message = "Success";
+ factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+ return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+LinearSolverTerminationType CUDADenseCholesky32Bit::Solve(
+ const double* rhs,
+ double* solution,
+ std::string* message) {
+ if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
+ *message = "Factorize did not complete succesfully previously.";
+ return factorize_result_;
+ }
+ // Copy RHS to GPU.
+ rhs_.CopyToGpu(rhs, num_cols_);
+ // Solve the system.
+ if (cusolverDnDpotrs(cusolver_handle_,
+ CUBLAS_FILL_MODE_LOWER,
+ num_cols_,
+ 1,
+ lhs_.data(),
+ num_cols_,
+ rhs_.data(),
+ num_cols_,
+ error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnDpotrs failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ if (cudaDeviceSynchronize() != cudaSuccess ||
+ cudaStreamSynchronize(stream_), cudaSuccess) {
+ *message = "Cuda device synchronization failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ // Check for errors.
+ int error = 0;
+ // Copy X from GPU to host.
+ rhs_.CopyToHost(solution, num_cols_);
+ if (error != 0) {
+ LOG(FATAL) << "Congratulations, you found a bug in Ceres."
+ << "Please report it."
+ << "cuSolverDN::cusolverDnDpotrs fatal error."
+ << "Argument: " << -error << " is invalid.";
+ }
+ // Copy error variable from GPU to host.
+ error_.CopyToHost(&error, 1);
+ *message = "Success";
+ return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+std::unique_ptr<CUDADenseCholesky32Bit> CUDADenseCholesky32Bit::Create(
+ const LinearSolver::Options& options) {
+ if (options.dense_linear_algebra_library_type != CUDA) {
+ // The user called the wrong factory method.
+ return nullptr;
+ }
+ CUDADenseCholesky32Bit* cuda_dense_cholesky = new CUDADenseCholesky32Bit();
+ std::string cuda_error;
+ if (cuda_dense_cholesky->Init(&cuda_error)) {
+ return std::unique_ptr<CUDADenseCholesky32Bit>(cuda_dense_cholesky);
+ }
+ // Initialization failed, destroy the object and return a nullptr.
+ delete cuda_dense_cholesky;
+ LOG(ERROR) << "CUDADenseCholesky32Bit::Init failed: " << cuda_error;
+ return std::unique_ptr<CUDADenseCholesky32Bit>(nullptr);
+}
+
+std::unique_ptr<CUDADenseCholesky64Bit> CUDADenseCholesky64Bit::Create(
+ const LinearSolver::Options& options) {
+ if (options.dense_linear_algebra_library_type != CUDA) {
+ // The user called the wrong factory method.
+ return nullptr;
+ }
+ CUDADenseCholesky64Bit* cuda_dense_cholesky = new CUDADenseCholesky64Bit();
+ std::string cuda_error;
+ if (cuda_dense_cholesky->Init(&cuda_error)) {
+ return std::unique_ptr<CUDADenseCholesky64Bit>(cuda_dense_cholesky);
+ }
+ // Initialization failed, destroy the object and return a nullptr.
+ delete cuda_dense_cholesky;
+ LOG(ERROR) << "CUDADenseCholesky64Bit::Init failed: " << cuda_error;
+ return std::unique_ptr<CUDADenseCholesky64Bit>(nullptr);
+}
+
+#ifdef CERES_CUDA_NO_64BIT_SOLVER_API
+// CUDA < 11.1 did not have the 64-bit APIs, so the implementation of the new
+// interface will just generate a fatal error.
+
+CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() {}
+
+bool CUDADenseCholesky64Bit::Init(std::string* message) {
+ *message = "Cannot use CUDADenseCholesky64Bit with CUDA < 11.1.";
+ return false;
+}
+
+LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize(
+ int,
+ double*,
+ std::string*) {
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+}
+
+LinearSolverTerminationType CUDADenseCholesky64Bit::Solve(
+ const double*,
+ double*,
+ std::string*) {
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+}
+
+#else // CERES_CUDA_NO_64BIT_SOLVER_API
+
+bool CUDADenseCholesky64Bit::Init(std::string* message) {
+ if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnCreate failed.";
+ return false;
+ }
+ if (cudaStreamCreate(&stream_) != cudaSuccess) {
+ *message = "cuSolverDN::cudaStreamCreate failed.";
+ cusolverDnDestroy(cusolver_handle_);
+ return false;
+ }
+ if (cusolverDnSetStream(cusolver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnSetStream failed.";
+ cudaStreamDestroy(stream_);
+ cusolverDnDestroy(cusolver_handle_);
+ return false;
+ }
+ error_.Reserve(1);
+ *message = "CUDADenseCholesky64Bit::Init Success.";
+ return true;
+}
+
+CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() {
+ if (cusolver_handle_ != nullptr) {
+ CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS);
+ CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess);
+ }
+}
+
+LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize(
+ int num_cols,
+ double* lhs,
+ std::string* message) {
+ factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ // Allocate GPU memory if necessary.
+ lhs_.Reserve(num_cols * num_cols);
+ num_cols_ = num_cols;
+ // Copy A to GPU.
+ lhs_.CopyToGpu(lhs, num_cols * num_cols);
+ // Allocate scratch space on GPU.
+ size_t host_workspace_size = 0;
+ size_t device_workspace_size = 0;
+ if (cusolverDnXpotrf_bufferSize(cusolver_handle_,
+ nullptr,
+ CUBLAS_FILL_MODE_LOWER,
+ num_cols,
+ CUDA_R_64F,
+ lhs_.data(),
+ num_cols,
+ CUDA_R_64F,
+ &device_workspace_size,
+ &host_workspace_size) !=
+ CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnXpotrf_bufferSize failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ // Allocate host scratch memory.
+ host_workspace_.resize(host_workspace_size);
+ // Allocate GPU scratch memory.
+ device_workspace_.Reserve(device_workspace_size);
+ // Run the actual factorization (potrf)
+ if (cusolverDnXpotrf(cusolver_handle_,
+ nullptr,
+ CUBLAS_FILL_MODE_LOWER,
+ num_cols,
+ CUDA_R_64F,
+ lhs_.data(),
+ num_cols,
+ CUDA_R_64F,
+ device_workspace_.data(),
+ device_workspace_.size(),
+ host_workspace_.data(),
+ host_workspace_.size(),
+ error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnXpotrf failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ if (cudaDeviceSynchronize() != cudaSuccess ||
+ cudaStreamSynchronize(stream_) != cudaSuccess) {
+ *message = "Cuda device synchronization failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ int error = 0;
+ // Check for errors.
+ error_.CopyToHost(&error, 1);
+ if (error < 0) {
+ LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
+ << "please report it. "
+ << "cuSolverDN::cusolverDnXpotrf fatal error."
+ << "Argument: " << -error << " is invalid.";
+ // The following line is unreachable, but return failure just to be
+ // pedantic, since the compiler does not know that.
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ } else if (error > 0) {
+ *message = StringPrintf(
+ "cuSolverDN::cusolverDnXpotrf numerical failure. "
+ "The leading minor of order %d is not positive definite.",
+ error);
+ factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
+ return LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
+ }
+
+ *message = "Success";
+ factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+ return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+LinearSolverTerminationType CUDADenseCholesky64Bit::Solve(
+ const double* rhs,
+ double* solution,
+ std::string* message) {
+ if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
+ *message = "Factorize did not complete succesfully previously.";
+ return factorize_result_;
+ }
+ // Copy RHS to GPU.
+ rhs_.CopyToGpu(rhs, num_cols_);
+ // Solve the system.
+ if (cusolverDnXpotrs(cusolver_handle_,
+ nullptr,
+ CUBLAS_FILL_MODE_LOWER,
+ num_cols_,
+ 1,
+ CUDA_R_64F,
+ lhs_.data(),
+ num_cols_,
+ CUDA_R_64F,
+ rhs_.data(),
+ num_cols_,
+ error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnXpotrs failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ if (cudaDeviceSynchronize() != cudaSuccess ||
+ cudaStreamSynchronize(stream_) != cudaSuccess) {
+ *message = "Cuda device synchronization failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ // Check for errors.
+ int error = 0;
+ // Copy error variable from GPU to host.
+ error_.CopyToHost(&error, 1);
+ if (error != 0) {
+ LOG(FATAL) << "Congratulations, you found a bug in Ceres."
+ << "Please report it."
+ << "cuSolverDN::cusolverDnXpotrs fatal error."
+ << "Argument: " << -error << " is invalid.";
+ }
+ // Copy X from GPU to host.
+ rhs_.CopyToHost(solution, num_cols_);
+ *message = "Success";
+ return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+#endif // CERES_CUDA_NO_64BIT_SOLVER_API
+
+#endif // CERES_NO_CUDA
+
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index 268862d..470998a 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -37,10 +37,16 @@
// clang-format on
#include <memory>
+#include <vector>
#include "Eigen/Dense"
+#include "ceres/cuda_buffer.h"
#include "ceres/linear_solver.h"
#include "glog/logging.h"
+#ifndef CERES_NO_CUDA
+#include "cuda_runtime.h"
+#include "cusolverDn.h"
+#endif // CERES_NO_CUDA
namespace ceres {
namespace internal {
@@ -129,6 +135,107 @@
};
#endif // CERES_NO_LAPACK
+#ifndef CERES_NO_CUDA
+// Implementation of DenseCholesky using the cuSolver library v.11.0 or older,
+// using the legacy cuSolverDn interface.
+class CERES_EXPORT_INTERNAL CUDADenseCholesky32Bit : public DenseCholesky {
+ public:
+ static std::unique_ptr<CUDADenseCholesky32Bit> Create(
+ const LinearSolver::Options& options);
+ ~CUDADenseCholesky32Bit() override;
+ CUDADenseCholesky32Bit(const CUDADenseCholesky32Bit&) = delete;
+ CUDADenseCholesky32Bit& operator=(const CUDADenseCholesky32Bit&) = delete;
+ LinearSolverTerminationType Factorize(int num_cols,
+ double* lhs,
+ std::string* message) override;
+ LinearSolverTerminationType Solve(const double* rhs,
+ double* solution,
+ std::string* message) override;
+
+ private:
+ CUDADenseCholesky32Bit() {}
+ // Initializes the cuSolverDN context, creates an asynchronous stream, and
+ // associates the stream with cuSolverDN. Returns true iff initialization was
+ // successful, else it returns false and a human-readable error message is
+ // returned.
+ bool Init(std::string* message);
+
+ // Handle to the cuSOLVER context.
+ cusolverDnHandle_t cusolver_handle_ = nullptr;
+ // CUDA device stream.
+ cudaStream_t stream_ = nullptr;
+ // Number of columns in the A matrix, to be cached between calls to *Factorize
+ // and *Solve.
+ size_t num_cols_ = 0;
+ // GPU memory allocated for the A matrix (lhs matrix).
+ CudaBuffer<double> lhs_;
+ // GPU memory allocated for the B matrix (rhs vector).
+ CudaBuffer<double> rhs_;
+ // Scratch space for cuSOLVER on the GPU.
+ CudaBuffer<uint8_t> device_workspace_;
+ // Required for error handling with cuSOLVER.
+ CudaBuffer<int> error_;
+ // Cache the result of Factorize to ensure that when Solve is called, the
+ // factiorization of lhs is valid.
+ LinearSolverTerminationType factorize_result_ =
+ LINEAR_SOLVER_FATAL_ERROR;
+};
+
+// Implementation of DenseCholesky using the cuSolver library v.11.1 or newer,
+// using the 64-bit cuSolverDn interface.
+class CERES_EXPORT_INTERNAL CUDADenseCholesky64Bit : public DenseCholesky {
+ public:
+ static std::unique_ptr<CUDADenseCholesky64Bit> Create(
+ const LinearSolver::Options& options);
+ ~CUDADenseCholesky64Bit() override;
+ CUDADenseCholesky64Bit(const CUDADenseCholesky64Bit&) = delete;
+ CUDADenseCholesky64Bit& operator=(const CUDADenseCholesky64Bit&) = delete;
+ LinearSolverTerminationType Factorize(int num_cols,
+ double* lhs,
+ std::string* message) override;
+ LinearSolverTerminationType Solve(const double* rhs,
+ double* solution,
+ std::string* message) override;
+
+ private:
+ CUDADenseCholesky64Bit() {}
+ // Initializes the cuSolverDN context, creates an asynchronous stream, and
+ // associates the stream with cuSolverDN. Returns true iff initialization was
+ // successful, else it returns false and a human-readable error message is
+ // returned.
+ bool Init(std::string* message);
+
+ // Handle to the cuSOLVER context.
+ cusolverDnHandle_t cusolver_handle_ = nullptr;
+ // CUDA device stream.
+ cudaStream_t stream_ = nullptr;
+ // Number of columns in the A matrix, to be cached between calls to Factorize
+ // and Solve.
+ size_t num_cols_ = 0;
+ // GPU memory allocated for the A matrix (lhs matrix).
+ CudaBuffer<double> lhs_;
+ // GPU memory allocated for the B matrix (rhs vector).
+ CudaBuffer<double> rhs_;
+ // Workspace for cuSOLVER on the GPU.
+ CudaBuffer<uint8_t> device_workspace_;
+ // Workspace for cuSOLVER on the host.
+ std::vector<double> host_workspace_;
+ // Required for error handling with cuSOLVER.
+ CudaBuffer<int> error_;
+ // Cache the result of Factorize to ensure that when Solve is called, the
+ // factiorization of lhs is valid.
+ LinearSolverTerminationType factorize_result_ =
+ LINEAR_SOLVER_FATAL_ERROR;
+};
+
+#ifdef CERES_CUDA_NO_64BIT_SOLVER_API
+using CUDADenseCholesky = CUDADenseCholesky32Bit;
+#else
+using CUDADenseCholesky = CUDADenseCholesky64Bit;
+#endif
+
+#endif // CERES_NO_CUDA
+
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/dense_cholesky_test.cc b/internal/ceres/dense_cholesky_test.cc
index eda4f10..4141e79 100644
--- a/internal/ceres/dense_cholesky_test.cc
+++ b/internal/ceres/dense_cholesky_test.cc
@@ -92,17 +92,20 @@
}
}
+INSTANTIATE_TEST_SUITE_P(
+ _,
+ DenseCholeskyTest,
+ ::testing::Values(
+ EIGEN,
#ifndef CERES_NO_LAPACK
-INSTANTIATE_TEST_SUITE_P(_,
- DenseCholeskyTest,
- ::testing::Values(EIGEN, LAPACK),
- ParamInfoToString);
-#else
-INSTANTIATE_TEST_SUITE_P(_,
- DenseCholeskyTest,
- ::testing::Values(EIGEN),
- ParamInfoToString);
+ LAPACK,
#endif
+#ifndef CERES_NO_CUDA
+ CUDA
+#endif
+ ),
+ ParamInfoToString);
+
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/dense_linear_solver_benchmark.cc b/internal/ceres/dense_linear_solver_benchmark.cc
index b8f1e00..2af9fa8 100644
--- a/internal/ceres/dense_linear_solver_benchmark.cc
+++ b/internal/ceres/dense_linear_solver_benchmark.cc
@@ -95,7 +95,12 @@
->Apply(MatrixSizes);
BENCHMARK_TEMPLATE2(BM_DenseSolver, ceres::LAPACK, ceres::DENSE_NORMAL_CHOLESKY)
->Apply(MatrixSizes);
-#endif
+#endif // CERES_NO_LAPACK
+
+#ifndef CERES_NO_CUDA
+BENCHMARK(BM_DenseSolver<ceres::CUDA, ceres::DENSE_NORMAL_CHOLESKY>)
+ ->Apply(MatrixSizes);
+#endif // CERES_NO_CUDA
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/solver_utils.cc b/internal/ceres/solver_utils.cc
index eb5aafa..1622e0f 100644
--- a/internal/ceres/solver_utils.cc
+++ b/internal/ceres/solver_utils.cc
@@ -36,6 +36,9 @@
#include "ceres/internal/config.h"
#include "ceres/internal/port.h"
#include "ceres/version.h"
+#ifndef CERES_NO_CUDA
+#include "cuda_runtime.h"
+#endif // CERES_NO_CUDA
namespace ceres {
namespace internal {
@@ -87,6 +90,10 @@
value += "-no_custom_blas";
#endif
+#ifndef CERES_NO_CUDA
+ value += "-cuda-(" + std::to_string(CUDART_VERSION) + ")";
+#endif
+
return value;
}
diff --git a/internal/ceres/types.cc b/internal/ceres/types.cc
index 39bb2d8..bf99568 100644
--- a/internal/ceres/types.cc
+++ b/internal/ceres/types.cc
@@ -128,6 +128,7 @@
switch (type) {
CASESTR(EIGEN);
CASESTR(LAPACK);
+ CASESTR(CUDA);
default:
return "UNKNOWN";
}
@@ -138,6 +139,7 @@
UpperCase(&value);
STRENUM(EIGEN);
STRENUM(LAPACK);
+ STRENUM(CUDA);
return false;
}