Add support for dense CUDA solvers #2 1. Add CUDADenseQR & tests. CUDADenseQR uses the cuSolverDN LAPACK implementation of QR factorization. A key limitation, however, is that this solver does not perform singularity checking -- this is because cuSolverDN does not have a trtrs implementation; we instead use cuBLAS' trsv for backsubstitution. 2. All CPU -> GPU memory transfers are now async, and both CUDADenseQR and CUDADenseCholesky explicitly manage their own streams for async operations. 3. Simplified CUDADenseCholesky to only use the legacy 32-bit cuSolverDN API. Change-Id: I2a9b7b65469658ddfe33b5b2a3892c8744d6e437
diff --git a/CMakeLists.txt b/CMakeLists.txt index 48ba201..69432f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -270,14 +270,6 @@ "${CUDA_LIBRARIES};" "${CUDA_cusolver_LIBRARY};" "${CUDA_cusparse_LIBRARY}") - if (CUDA_VERSION VERSION_LESS 11.1) - message("-- CUDA version ${CUDA_VERSION} is older than 11.1, " - "using 32-bit API.") - add_definitions(-DCERES_CUDA_NO_64BIT_SOLVER_API) - else() - message("-- CUDA version ${CUDA_VERSION} is at least 11.1, " - "using 64-bit API.") - endif() else (CUDA_FOUND) message("-- Did not find CUDA library, disabling CUDA support.") update_cache_variable(CUDA OFF)
diff --git a/examples/nist.cc b/examples/nist.cc index a3430ca..cb0d1aa0 100644 --- a/examples/nist.cc +++ b/examples/nist.cc
@@ -101,7 +101,7 @@ "and cgnr"); DEFINE_string(dense_linear_algebra_library, "eigen", - "Options are: eigen and lapack."); + "Options are: eigen, lapack, and cuda."); DEFINE_string(preconditioner, "jacobi", "Options are: identity, jacobi"); DEFINE_string(line_search, "wolfe",
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index 166eb19..6c6c3c1 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -143,6 +143,7 @@ list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${CUDA_LIBRARIES} + ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDA_cusparse_LIBRARY}) endif (CUDA_FOUND) @@ -479,6 +480,7 @@ ceres_test(covariance) ceres_test(cubic_interpolation) ceres_test(cuda_dense_cholesky) + ceres_test(cuda_dense_qr) ceres_test(dense_linear_solver) ceres_test(dense_cholesky) ceres_test(dense_qr)
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h index 44e7de3..61f9752 100644 --- a/internal/ceres/cuda_buffer.h +++ b/internal/ceres/cuda_buffer.h
@@ -45,7 +45,8 @@ // the appropriate GPU device is selected before each subroutine is called. This // is particularly important when using multiple GPU devices on different CPU // threads, since active Cuda devices are determined by the cuda runtime on a -// per-thread basis. +// per-thread basis. Note that unless otherwise specified, all methods use the +// default stream, and are synchronous. template <typename T> class CudaBuffer { public: @@ -59,6 +60,8 @@ } } + // Grow the GPU memory buffer if needed to accommodate data of the specified + // size void Reserve(const size_t size) { if (size > size_) { if (data_ != nullptr) { @@ -69,12 +72,21 @@ } } - void CopyToGpu(const T* data, const size_t size) { + // Perform an asynchronous copy from CPU memory to GPU memory using the stream + // provided. + void CopyToGpuAsync(const T* data, const size_t size, cudaStream_t stream) { Reserve(size); - CHECK_EQ(cudaMemcpy(data_, data, size * sizeof(T), cudaMemcpyHostToDevice), + CHECK_EQ(cudaMemcpyAsync(data_, + data, + size * sizeof(T), + cudaMemcpyHostToDevice, + stream), cudaSuccess); } + // Copy data from the GPU to CPU memory. This is necessarily synchronous since + // any potential GPU kernels that may be writing to the buffer must finish + // before the transfer happens. void CopyToHost(T* data, const size_t size) { CHECK(data_ != nullptr); CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc index 7d39b0e..6c4dcc3 100644 --- a/internal/ceres/cuda_dense_cholesky_test.cc +++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -36,21 +36,19 @@ #include "glog/logging.h" #include "gtest/gtest.h" -using std::string; - namespace ceres { namespace internal { #ifndef CERES_NO_CUDA -TEST(DenseCholeskyCudaSolverOld, InvalidOptionOnCreate) { +TEST(CUDADenseCholesky, InvalidOptionOnCreate) { LinearSolver::Options options; - auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options); + auto dense_cuda_solver = CUDADenseCholesky::Create(options); EXPECT_EQ(dense_cuda_solver, nullptr); } // Tests the CUDA Cholesky solver with a simple 4x4 matrix. -TEST(DenseCholeskyCudaSolverOld, Cholesky4x4Matrix) { +TEST(CUDADenseCholesky, Cholesky4x4Matrix) { Eigen::Matrix4d A; A << 4, 12, -16, 0, 12, 37, -43, 0, @@ -59,9 +57,9 @@ const Eigen::Vector4d b = Eigen::Vector4d::Ones(); LinearSolver::Options options; options.dense_linear_algebra_library_type = CUDA; - auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options); + auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); - string error_string; + std::string error_string; ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), A.data(), &error_string), @@ -75,7 +73,7 @@ EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10); } -TEST(DenseCholeskyCudaSolverOld, SingularMatrix) { +TEST(CUDADenseCholesky, SingularMatrix) { Eigen::Matrix3d A; A << 1, 0, 0, 0, 1, 0, @@ -83,16 +81,16 @@ const Eigen::Vector3d b = Eigen::Vector3d::Ones(); LinearSolver::Options options; options.dense_linear_algebra_library_type = CUDA; - auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options); + auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); - string error_string; + std::string error_string; ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), A.data(), &error_string), LinearSolverTerminationType::LINEAR_SOLVER_FAILURE); } -TEST(DenseCholeskyCudaSolverOld, NegativeMatrix) { +TEST(CUDADenseCholesky, NegativeMatrix) { Eigen::Matrix3d A; A << 1, 0, 0, 0, 1, 0, @@ -100,107 +98,26 @@ const Eigen::Vector3d b = Eigen::Vector3d::Ones(); LinearSolver::Options options; options.dense_linear_algebra_library_type = CUDA; - auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options); + auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); - string error_string; + std::string error_string; ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), A.data(), &error_string), LinearSolverTerminationType::LINEAR_SOLVER_FAILURE); } -TEST(DenseCholeskyCudaSolverOld, MustFactorizeBeforeSolve) { +TEST(CUDADenseCholesky, MustFactorizeBeforeSolve) { const Eigen::Vector3d b = Eigen::Vector3d::Ones(); LinearSolver::Options options; options.dense_linear_algebra_library_type = CUDA; - auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options); + auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); - string error_string; + std::string error_string; ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string), LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR); } -#ifndef CERES_CUDA_NO_64BIT_SOLVER_API -// TODO(sameeragarwal): Convert these tests into templated tests so that we use -// the exact same code for both versions of the test. - -TEST(DenseCholeskyCudaSolverNew, InvalidOptionOnCreate) { - LinearSolver::Options options; - auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options); - EXPECT_EQ(dense_cuda_solver, nullptr); -} - -TEST(DenseCholeskyCudaSolverNew, Cholesky4x4Matrix) { - Eigen::Matrix4d A; - A << 4, 12, -16, 0, - 12, 37, -43, 0, - -16, -43, 98, 0, - 0, 0, 0, 1; - const Eigen::Vector4d b = Eigen::Vector4d::Ones(); - LinearSolver::Options options; - options.dense_linear_algebra_library_type = CUDA; - auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options); - ASSERT_NE(dense_cuda_solver, nullptr); - string error_string; - ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), - A.data(), - &error_string), - LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS); - Eigen::Vector4d x = Eigen::Vector4d::Zero(); - ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string), - LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS); - EXPECT_NEAR(x(0), 113.75 / 3.0, std::numeric_limits<double>::epsilon() * 10); - EXPECT_NEAR(x(1), -31.0 / 3.0, std::numeric_limits<double>::epsilon() * 10); - EXPECT_NEAR(x(2), 5.0 / 3.0, std::numeric_limits<double>::epsilon() * 10); - EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10); -} - -TEST(DenseCholeskyCudaSolverNew, SingularMatrix) { - Eigen::Matrix3d A; - A << 1, 0, 0, - 0, 1, 0, - 0, 0, 0; - const Eigen::Vector3d b = Eigen::Vector3d::Ones(); - LinearSolver::Options options; - options.dense_linear_algebra_library_type = CUDA; - auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options); - ASSERT_NE(dense_cuda_solver, nullptr); - string error_string; - ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), - A.data(), - &error_string), - LinearSolverTerminationType::LINEAR_SOLVER_FAILURE); -} - -TEST(DenseCholeskyCudaSolverNew, NegativeMatrix) { - Eigen::Matrix3d A; - A << 1, 0, 0, - 0, 1, 0, - 0, 0, -1; - const Eigen::Vector3d b = Eigen::Vector3d::Ones(); - LinearSolver::Options options; - options.dense_linear_algebra_library_type = CUDA; - auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options); - ASSERT_NE(dense_cuda_solver, nullptr); - string error_string; - ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(), - A.data(), - &error_string), - LinearSolverTerminationType::LINEAR_SOLVER_FAILURE); -} - -TEST(DenseCholeskyCudaSolverNew, MustFactorizeBeforeSolve) { - const Eigen::Vector3d b = Eigen::Vector3d::Ones(); - LinearSolver::Options options; - options.dense_linear_algebra_library_type = CUDA; - auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options); - ASSERT_NE(dense_cuda_solver, nullptr); - string error_string; - ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string), - LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR); -} -#endif // CERES_CUDA_NO_64BIT_SOLVER_API - #endif // CERES_NO_CUDA } // namespace internal
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc new file mode 100644 index 0000000..15ba00a --- /dev/null +++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -0,0 +1,164 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: joydeepb@cs.utexas.edu (Joydeep Biswas) + +#include <string> + +#include "ceres/dense_qr.h" +#include "ceres/internal/eigen.h" + +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace ceres { +namespace internal { + +#ifndef CERES_NO_CUDA + +TEST(CUDADenseQR, InvalidOptionOnCreate) { + LinearSolver::Options options; + auto dense_cuda_solver = CUDADenseQR::Create(options); + EXPECT_EQ(dense_cuda_solver, nullptr); +} + +// Tests the CUDA QR solver with a simple 4x4 matrix. +TEST(CUDADenseQR, QR4x4Matrix) { + Eigen::Matrix4d A; + A << 4, 12, -16, 0, + 12, 37, -43, 0, + -16, -43, 98, 0, + 0, 0, 0, 1; + const Eigen::Vector4d b = Eigen::Vector4d::Ones(); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseQR::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + std::string error_string; + ASSERT_EQ(dense_cuda_solver->Factorize(A.rows(), + A.cols(), + A.data(), + &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS); + Eigen::Vector4d x = Eigen::Vector4d::Zero(); + ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS); + // Empirically observed accuracy of cuSolverDN's QR solver. + const double kEpsilon = 1e-11; + EXPECT_NEAR(x(0), 113.75 / 3.0, kEpsilon); + EXPECT_NEAR(x(1), -31.0 / 3.0, kEpsilon); + EXPECT_NEAR(x(2), 5.0 / 3.0, kEpsilon); + EXPECT_NEAR(x(3), 1.0000, kEpsilon); +} + +// Tests the CUDA QR solver with a simple 4x4 matrix. +TEST(CUDADenseQR, QR4x2Matrix) { + Eigen::Matrix<double, 4, 2> A; + A << 4, 12, + 12, 37, + -16, -43, + 0, 0; + const std::vector<double> b(4, 1.0); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseQR::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + std::string error_string; + ASSERT_EQ(dense_cuda_solver->Factorize(A.rows(), + A.cols(), + A.data(), + &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS); + std::vector<double> x(2, 0); + ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS); + // Empirically observed accuracy of cuSolverDN's QR solver. + const double kEpsilon = 1e-11; + // Solution values computed with Octave. + EXPECT_NEAR(x[0], -1.143410852713177, kEpsilon); + EXPECT_NEAR(x[1], 0.4031007751937981, kEpsilon); +} + +TEST(CUDADenseQR, MustFactorizeBeforeSolve) { + const Eigen::Vector3d b = Eigen::Vector3d::Ones(); + LinearSolver::Options options; + options.dense_linear_algebra_library_type = CUDA; + auto dense_cuda_solver = CUDADenseQR::Create(options); + ASSERT_NE(dense_cuda_solver, nullptr); + std::string error_string; + ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string), + LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR); +} + +TEST(CUDADenseQR, Randomized1600x100Tests) { + const int kNumRows = 1600; + const int kNumCols = 100; + using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>; + using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>; + using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>; + + LinearSolver::Options options; + options.dense_linear_algebra_library_type = ceres::CUDA; + std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options); + + const int kNumTrials = 100; + const int kMinNumCols = 1; + const int kMaxNumCols = 10; + const int kMinRowsFactor = 1; + const int kMaxRowsFactor = 3; + for (int i = 0; i < kNumTrials; ++i) { + LhsType lhs = LhsType::Random(kNumRows, kNumCols); + SolutionType x_expected = SolutionType::Random(kNumCols); + RhsType rhs = lhs * x_expected; + SolutionType x_computed = SolutionType::Zero(kNumCols); + // Sanity check the random matrix sizes. + EXPECT_EQ(lhs.rows(), kNumRows); + EXPECT_EQ(lhs.cols(), kNumCols); + EXPECT_EQ(rhs.rows(), kNumRows); + EXPECT_EQ(rhs.cols(), 1); + EXPECT_EQ(x_expected.rows(), kNumCols); + EXPECT_EQ(x_expected.cols(), 1); + EXPECT_EQ(x_computed.rows(), kNumCols); + EXPECT_EQ(x_computed.cols(), 1); + LinearSolver::Summary summary; + summary.termination_type = dense_qr->FactorAndSolve(kNumRows, + kNumCols, + lhs.data(), + rhs.data(), + x_computed.data(), + &summary.message); + ASSERT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS); + ASSERT_NEAR((x_computed - x_expected).norm() / x_expected.norm(), + 0.0, + std::numeric_limits<double>::epsilon() * 400); + } +} +#endif // CERES_NO_CUDA + +} // namespace internal +} // namespace ceres
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc index af555e7..2d8c2da 100644 --- a/internal/ceres/dense_cholesky.cc +++ b/internal/ceres/dense_cholesky.cc
@@ -193,13 +193,14 @@ #ifndef CERES_NO_CUDA -bool CUDADenseCholesky32Bit::Init(std::string* message) { +bool CUDADenseCholesky::Init(std::string* message) { if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnCreate failed."; return false; } - if (cudaStreamCreate(&stream_) != cudaSuccess) { - *message = "cuSolverDN::cudaStreamCreate failed."; + if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) != + cudaSuccess) { + *message = "CUDA::cudaStreamCreateWithFlags failed."; cusolverDnDestroy(cusolver_handle_); return false; } @@ -211,26 +212,23 @@ return false; } error_.Reserve(1); - *message = "CUDADenseCholesky64Bit::Init Success."; + *message = "CUDADenseCholesky::Init Success."; return true; } -CUDADenseCholesky32Bit::~CUDADenseCholesky32Bit() { +CUDADenseCholesky::~CUDADenseCholesky() { if (cusolver_handle_ != nullptr) { CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS); CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess); } } -LinearSolverTerminationType CUDADenseCholesky32Bit::Factorize( +LinearSolverTerminationType CUDADenseCholesky::Factorize( int num_cols, double* lhs, std::string* message) { factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; - // Allocate GPU memory if necessary. lhs_.Reserve(num_cols * num_cols); num_cols_ = num_cols; - // Copy A to GPU. - lhs_.CopyToGpu(lhs, num_cols * num_cols); - // Allocate scratch space on GPU. + lhs_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_); int device_workspace_size = 0; if (cusolverDnDpotrf_bufferSize(cusolver_handle_, CUBLAS_FILL_MODE_LOWER, @@ -242,9 +240,7 @@ *message = "cuSolverDN::cusolverDnDpotrf_bufferSize failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } - // Allocate GPU scratch memory. device_workspace_.Reserve(device_workspace_size); - // Run the actual factorization (potrf) if (cusolverDnDpotrf(cusolver_handle_, CUBLAS_FILL_MODE_LOWER, num_cols, @@ -256,13 +252,12 @@ *message = "cuSolverDN::cusolverDnDpotrf failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } - if (cudaDeviceSynchronize() != cudaSuccess || cudaStreamSynchronize(stream_), - cudaSuccess) { + if (cudaDeviceSynchronize() != cudaSuccess || + cudaStreamSynchronize(stream_) != cudaSuccess) { *message = "Cuda device synchronization failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } int error = 0; - // Check for errors. error_.CopyToHost(&error, 1); if (error < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres - " @@ -285,15 +280,13 @@ return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; } -LinearSolverTerminationType CUDADenseCholesky32Bit::Solve( +LinearSolverTerminationType CUDADenseCholesky::Solve( const double* rhs, double* solution, std::string* message) { if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) { *message = "Factorize did not complete succesfully previously."; return factorize_result_; } - // Copy RHS to GPU. - rhs_.CopyToGpu(rhs, num_cols_); - // Solve the system. + rhs_.CopyToGpuAsync(rhs, num_cols_, stream_); if (cusolverDnDpotrs(cusolver_handle_, CUBLAS_FILL_MODE_LOWER, num_cols_, @@ -306,237 +299,41 @@ *message = "cuSolverDN::cusolverDnDpotrs failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } - if (cudaDeviceSynchronize() != cudaSuccess || cudaStreamSynchronize(stream_), - cudaSuccess) { + if (cudaDeviceSynchronize() != cudaSuccess || + cudaStreamSynchronize(stream_) != cudaSuccess) { *message = "Cuda device synchronization failed."; return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; } - // Check for errors. int error = 0; - // Copy X from GPU to host. - rhs_.CopyToHost(solution, num_cols_); + error_.CopyToHost(&error, 1); if (error != 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it." << "cuSolverDN::cusolverDnDpotrs fatal error. " << "Argument: " << -error << " is invalid."; } - // Copy error variable from GPU to host. - error_.CopyToHost(&error, 1); - *message = "Success"; - return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; -} - -std::unique_ptr<CUDADenseCholesky32Bit> CUDADenseCholesky32Bit::Create( - const LinearSolver::Options& options) { - if (options.dense_linear_algebra_library_type != CUDA) { - // The user called the wrong factory method. - return nullptr; - } - CUDADenseCholesky32Bit* cuda_dense_cholesky = new CUDADenseCholesky32Bit(); - std::string cuda_error; - if (cuda_dense_cholesky->Init(&cuda_error)) { - return std::unique_ptr<CUDADenseCholesky32Bit>(cuda_dense_cholesky); - } - // Initialization failed, destroy the object and return a nullptr. - delete cuda_dense_cholesky; - LOG(ERROR) << "CUDADenseCholesky32Bit::Init failed: " << cuda_error; - return std::unique_ptr<CUDADenseCholesky32Bit>(nullptr); -} - -std::unique_ptr<CUDADenseCholesky64Bit> CUDADenseCholesky64Bit::Create( - const LinearSolver::Options& options) { - if (options.dense_linear_algebra_library_type != CUDA) { - // The user called the wrong factory method. - return nullptr; - } - CUDADenseCholesky64Bit* cuda_dense_cholesky = new CUDADenseCholesky64Bit(); - std::string cuda_error; - if (cuda_dense_cholesky->Init(&cuda_error)) { - return std::unique_ptr<CUDADenseCholesky64Bit>(cuda_dense_cholesky); - } - // Initialization failed, destroy the object and return a nullptr. - delete cuda_dense_cholesky; - LOG(ERROR) << "CUDADenseCholesky64Bit::Init failed: " << cuda_error; - return std::unique_ptr<CUDADenseCholesky64Bit>(nullptr); -} - -#ifdef CERES_CUDA_NO_64BIT_SOLVER_API -// CUDA < 11.1 did not have the 64-bit APIs, so the implementation of the new -// interface will just generate a fatal error. - -CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() {} - -bool CUDADenseCholesky64Bit::Init(std::string* message) { - *message = "Cannot use CUDADenseCholesky64Bit with CUDA < 11.1."; - return false; -} - -LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize(int, - double*, - std::string*) { - return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; -} - -LinearSolverTerminationType CUDADenseCholesky64Bit::Solve(const double*, - double*, - std::string*) { - return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; -} - -#else // CERES_CUDA_NO_64BIT_SOLVER_API - -bool CUDADenseCholesky64Bit::Init(std::string* message) { - if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) { - *message = "cuSolverDN::cusolverDnCreate failed."; - return false; - } - if (cudaStreamCreate(&stream_) != cudaSuccess) { - *message = "cuSolverDN::cudaStreamCreate failed."; - cusolverDnDestroy(cusolver_handle_); - return false; - } - if (cusolverDnSetStream(cusolver_handle_, stream_) != - CUSOLVER_STATUS_SUCCESS) { - *message = "cuSolverDN::cusolverDnSetStream failed."; - cudaStreamDestroy(stream_); - cusolverDnDestroy(cusolver_handle_); - return false; - } - error_.Reserve(1); - *message = "CUDADenseCholesky64Bit::Init Success."; - return true; -} - -CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() { - if (cusolver_handle_ != nullptr) { - CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS); - CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess); - } -} - -LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize( - int num_cols, double* lhs, std::string* message) { - factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; - // Allocate GPU memory if necessary. - lhs_.Reserve(num_cols * num_cols); - num_cols_ = num_cols; - // Copy A to GPU. - lhs_.CopyToGpu(lhs, num_cols * num_cols); - // Allocate scratch space on GPU. - size_t host_workspace_size = 0; - size_t device_workspace_size = 0; - if (cusolverDnXpotrf_bufferSize(cusolver_handle_, - nullptr, - CUBLAS_FILL_MODE_LOWER, - num_cols, - CUDA_R_64F, - lhs_.data(), - num_cols, - CUDA_R_64F, - &device_workspace_size, - &host_workspace_size) != - CUSOLVER_STATUS_SUCCESS) { - *message = "cuSolverDN::cusolverDnXpotrf_bufferSize failed."; - return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; - } - // Allocate host scratch memory. - host_workspace_.resize(host_workspace_size); - // Allocate GPU scratch memory. - device_workspace_.Reserve(device_workspace_size); - // Run the actual factorization (potrf) - if (cusolverDnXpotrf(cusolver_handle_, - nullptr, - CUBLAS_FILL_MODE_LOWER, - num_cols, - CUDA_R_64F, - lhs_.data(), - num_cols, - CUDA_R_64F, - device_workspace_.data(), - device_workspace_.size(), - host_workspace_.data(), - host_workspace_.size(), - error_.data()) != CUSOLVER_STATUS_SUCCESS) { - *message = "cuSolverDN::cusolverDnXpotrf failed."; - return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; - } - if (cudaDeviceSynchronize() != cudaSuccess || - cudaStreamSynchronize(stream_) != cudaSuccess) { - *message = "Cuda device synchronization failed."; - return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; - } - int error = 0; - // Check for errors. - error_.CopyToHost(&error, 1); - if (error < 0) { - LOG(FATAL) << "Congratulations, you found a bug in Ceres - " - << "please report it. " - << "cuSolverDN::cusolverDnXpotrf fatal error. " - << "Argument: " << -error << " is invalid."; - // The following line is unreachable, but return failure just to be - // pedantic, since the compiler does not know that. - return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; - } else if (error > 0) { - *message = StringPrintf( - "cuSolverDN::cusolverDnXpotrf numerical failure. " - "The leading minor of order %d is not positive definite.", - error); - factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FAILURE; - return LinearSolverTerminationType::LINEAR_SOLVER_FAILURE; - } - - *message = "Success"; - factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; - return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; -} - -LinearSolverTerminationType CUDADenseCholesky64Bit::Solve( - const double* rhs, double* solution, std::string* message) { - if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) { - *message = "Factorize did not complete succesfully previously."; - return factorize_result_; - } - // Copy RHS to GPU. - rhs_.CopyToGpu(rhs, num_cols_); - // Solve the system. - if (cusolverDnXpotrs(cusolver_handle_, - nullptr, - CUBLAS_FILL_MODE_LOWER, - num_cols_, - 1, - CUDA_R_64F, - lhs_.data(), - num_cols_, - CUDA_R_64F, - rhs_.data(), - num_cols_, - error_.data()) != CUSOLVER_STATUS_SUCCESS) { - *message = "cuSolverDN::cusolverDnXpotrs failed."; - return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; - } - if (cudaDeviceSynchronize() != cudaSuccess || - cudaStreamSynchronize(stream_) != cudaSuccess) { - *message = "Cuda device synchronization failed."; - return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; - } - // Check for errors. - int error = 0; - // Copy error variable from GPU to host. - error_.CopyToHost(&error, 1); - if (error != 0) { - LOG(FATAL) << "Congratulations, you found a bug in Ceres. " - << "Please report it. " - << "cuSolverDN::cusolverDnXpotrs fatal error. " - << "Argument: " << -error << " is invalid."; - } - // Copy X from GPU to host. rhs_.CopyToHost(solution, num_cols_); *message = "Success"; return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; } -#endif // CERES_CUDA_NO_64BIT_SOLVER_API +std::unique_ptr<CUDADenseCholesky> CUDADenseCholesky::Create( + const LinearSolver::Options& options) { + if (options.dense_linear_algebra_library_type != CUDA) { + // The user called the wrong factory method. + return nullptr; + } + auto cuda_dense_cholesky = + std::unique_ptr<CUDADenseCholesky>(new CUDADenseCholesky()); + std::string cuda_error; + if (cuda_dense_cholesky->Init(&cuda_error)) { + return cuda_dense_cholesky; + } + // Initialization failed, destroy the object (done automatically) and return a + // nullptr. + LOG(ERROR) << "CUDADenseCholesky::Init failed: " << cuda_error; + return nullptr; +} #endif // CERES_NO_CUDA
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h index f7e48d4..49d780c 100644 --- a/internal/ceres/dense_cholesky.h +++ b/internal/ceres/dense_cholesky.h
@@ -134,15 +134,15 @@ #endif // CERES_NO_LAPACK #ifndef CERES_NO_CUDA -// Implementation of DenseCholesky using the cuSolver library v.11.0 or older, -// using the legacy cuSolverDn interface. -class CERES_NO_EXPORT CUDADenseCholesky32Bit : public DenseCholesky { +// CUDA implementation of DenseCholesky using the cuSolverDN library using the +// 32-bit legacy interface for maximum compatibility. +class CERES_NO_EXPORT CUDADenseCholesky : public DenseCholesky { public: - static std::unique_ptr<CUDADenseCholesky32Bit> Create( + static std::unique_ptr<CUDADenseCholesky> Create( const LinearSolver::Options& options); - ~CUDADenseCholesky32Bit() override; - CUDADenseCholesky32Bit(const CUDADenseCholesky32Bit&) = delete; - CUDADenseCholesky32Bit& operator=(const CUDADenseCholesky32Bit&) = delete; + ~CUDADenseCholesky() override; + CUDADenseCholesky(const CUDADenseCholesky&) = delete; + CUDADenseCholesky& operator=(const CUDADenseCholesky&) = delete; LinearSolverTerminationType Factorize(int num_cols, double* lhs, std::string* message) override; @@ -151,7 +151,7 @@ std::string* message) override; private: - CUDADenseCholesky32Bit() = default; + CUDADenseCholesky() = default; // Initializes the cuSolverDN context, creates an asynchronous stream, and // associates the stream with cuSolverDN. Returns true iff initialization was // successful, else it returns false and a human-readable error message is @@ -179,59 +179,6 @@ LINEAR_SOLVER_FATAL_ERROR; }; -// Implementation of DenseCholesky using the cuSolver library v.11.1 or newer, -// using the 64-bit cuSolverDn interface. -class CERES_NO_EXPORT CUDADenseCholesky64Bit : public DenseCholesky { - public: - static std::unique_ptr<CUDADenseCholesky64Bit> Create( - const LinearSolver::Options& options); - ~CUDADenseCholesky64Bit() override; - CUDADenseCholesky64Bit(const CUDADenseCholesky64Bit&) = delete; - CUDADenseCholesky64Bit& operator=(const CUDADenseCholesky64Bit&) = delete; - LinearSolverTerminationType Factorize(int num_cols, - double* lhs, - std::string* message) override; - LinearSolverTerminationType Solve(const double* rhs, - double* solution, - std::string* message) override; - - private: - CUDADenseCholesky64Bit() = default; - // Initializes the cuSolverDN context, creates an asynchronous stream, and - // associates the stream with cuSolverDN. Returns true iff initialization was - // successful, else it returns false and a human-readable error message is - // returned. - bool Init(std::string* message); - - // Handle to the cuSOLVER context. - cusolverDnHandle_t cusolver_handle_ = nullptr; - // CUDA device stream. - cudaStream_t stream_ = nullptr; - // Number of columns in the A matrix, to be cached between calls to Factorize - // and Solve. - size_t num_cols_ = 0; - // GPU memory allocated for the A matrix (lhs matrix). - CudaBuffer<double> lhs_; - // GPU memory allocated for the B matrix (rhs vector). - CudaBuffer<double> rhs_; - // Workspace for cuSOLVER on the GPU. - CudaBuffer<uint8_t> device_workspace_; - // Workspace for cuSOLVER on the host. - std::vector<double> host_workspace_; - // Required for error handling with cuSOLVER. - CudaBuffer<int> error_; - // Cache the result of Factorize to ensure that when Solve is called, the - // factiorization of lhs is valid. - LinearSolverTerminationType factorize_result_ = - LINEAR_SOLVER_FATAL_ERROR; -}; - -#ifdef CERES_CUDA_NO_64BIT_SOLVER_API -using CUDADenseCholesky = CUDADenseCholesky32Bit; -#else -using CUDADenseCholesky = CUDADenseCholesky64Bit; -#endif - #endif // CERES_NO_CUDA } // namespace internal
diff --git a/internal/ceres/dense_linear_solver_benchmark.cc b/internal/ceres/dense_linear_solver_benchmark.cc index 2af9fa8..ba171bf 100644 --- a/internal/ceres/dense_linear_solver_benchmark.cc +++ b/internal/ceres/dense_linear_solver_benchmark.cc
@@ -98,7 +98,9 @@ #endif // CERES_NO_LAPACK #ifndef CERES_NO_CUDA -BENCHMARK(BM_DenseSolver<ceres::CUDA, ceres::DENSE_NORMAL_CHOLESKY>) +BENCHMARK_TEMPLATE2(BM_DenseSolver, ceres::CUDA, ceres::DENSE_NORMAL_CHOLESKY) + ->Apply(MatrixSizes); +BENCHMARK_TEMPLATE2(BM_DenseSolver, ceres::CUDA, ceres::DENSE_QR) ->Apply(MatrixSizes); #endif // CERES_NO_CUDA
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc index 33a7cbd..77f04ad 100644 --- a/internal/ceres/dense_qr.cc +++ b/internal/ceres/dense_qr.cc
@@ -33,6 +33,10 @@ #include <algorithm> #include <memory> #include <string> +#ifndef CERES_NO_CUDA +#include "cusolverDn.h" +#include "cublas_v2.h" +#endif // CERES_NO_CUDA #ifndef CERES_NO_LAPACK @@ -124,6 +128,14 @@ LOG(FATAL) << "Ceres was compiled without support for LAPACK."; #endif + case CUDA: +#ifndef CERES_NO_CUDA + dense_qr = CUDADenseQR::Create(options); + break; +#else + LOG(FATAL) << "Ceres was compiled without support for CUDA."; +#endif + default: LOG(FATAL) << "Unknown dense linear algebra library type : " << DenseLinearAlgebraLibraryTypeToString( @@ -296,5 +308,207 @@ #endif // CERES_NO_LAPACK +#ifndef CERES_NO_CUDA + +bool CUDADenseQR::Init(std::string* message) { + if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { + *message = "cuBLAS::cublasCreate failed."; + return false; + } + if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnCreate failed."; + return false; + } + if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) != + cudaSuccess) { + *message = "CUDA::cudaStreamCreateWithFlags failed."; + cusolverDnDestroy(cusolver_handle_); + cublasDestroy(cublas_handle_); + return false; + } + if (cusolverDnSetStream(cusolver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnSetStream failed."; + cusolverDnDestroy(cusolver_handle_); + cudaStreamDestroy(stream_); + cublasDestroy(cublas_handle_); + return false; + } + if (cublasSetStream(cublas_handle_, stream_) != CUBLAS_STATUS_SUCCESS) { + *message = "cuBLAS::cublasSetStream failed."; + cusolverDnDestroy(cusolver_handle_); + cublasDestroy(cublas_handle_); + cudaStreamDestroy(stream_); + return false; + } + error_.Reserve(1); + *message = "CUDADenseQR::Init Success."; + return true; +} + +CUDADenseQR::~CUDADenseQR() { + if (cublas_handle_ != nullptr) { + CHECK_EQ(cublasDestroy(cublas_handle_), CUBLAS_STATUS_SUCCESS); + } + if (cusolver_handle_ != nullptr) { + CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS); + } + if (stream_ != nullptr) { + CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess); + } +} + +LinearSolverTerminationType CUDADenseQR::Factorize( + int num_rows, int num_cols, double* lhs, std::string* message) { + factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + lhs_.Reserve(num_rows * num_cols); + tau_.Reserve(std::min(num_rows, num_cols)); + num_rows_ = num_rows; + num_cols_ = num_cols; + lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_); + int device_workspace_size = 0; + if (cusolverDnDgeqrf_bufferSize(cusolver_handle_, + num_rows, + num_cols, + lhs_.data(), + num_rows, + &device_workspace_size) != + CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnDgeqrf_bufferSize failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + device_workspace_.Reserve(device_workspace_size); + if (cusolverDnDgeqrf(cusolver_handle_, + num_rows, + num_cols, + lhs_.data(), + num_rows, + tau_.data(), + reinterpret_cast<double*>(device_workspace_.data()), + device_workspace_.size(), + error_.data()) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnDgeqrf failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + if (cudaDeviceSynchronize() != cudaSuccess || + cudaStreamSynchronize(stream_) != cudaSuccess) { + *message = "Cuda device synchronization failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + int error = 0; + error_.CopyToHost(&error, 1); + if (error < 0) { + LOG(FATAL) << "Congratulations, you found a bug in Ceres - " + << "please report it. " + << "cuSolverDN::cusolverDnDgeqrf fatal error. " + << "Argument: " << -error << " is invalid."; + // The following line is unreachable, but return failure just to be + // pedantic, since the compiler does not know that. + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + + *message = "Success"; + factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; + return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; +} + +LinearSolverTerminationType CUDADenseQR::Solve( + const double* rhs, double* solution, std::string* message) { + if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) { + *message = "Factorize did not complete succesfully previously."; + return factorize_result_; + } + rhs_.CopyToGpuAsync(rhs, num_rows_, stream_); + int device_workspace_size = 0; + if (cusolverDnDormqr_bufferSize(cusolver_handle_, + CUBLAS_SIDE_LEFT, + CUBLAS_OP_T, + num_rows_, + 1, + num_cols_, + lhs_.data(), + num_rows_, + tau_.data(), + rhs_.data(), + num_rows_, + &device_workspace_size) != + CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnDormqr_bufferSize failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + device_workspace_.Reserve(device_workspace_size); + // Compute rhs = Q^T * rhs, assuming that lhs has already been factorized. + // The result of factorization would have stored Q in a packed form in lhs_. + if (cusolverDnDormqr(cusolver_handle_, + CUBLAS_SIDE_LEFT, + CUBLAS_OP_T, + num_rows_, + 1, + num_cols_, + lhs_.data(), + num_rows_, + tau_.data(), + rhs_.data(), + num_rows_, + reinterpret_cast<double*>(device_workspace_.data()), + device_workspace_.size(), + error_.data()) != CUSOLVER_STATUS_SUCCESS) { + *message = "cuSolverDN::cusolverDnDormqr failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + int error = 0; + error_.CopyToHost(&error, 1); + if (error < 0) { + LOG(FATAL) << "Congratulations, you found a bug in Ceres. " + << "Please report it." + << "cuSolverDN::cusolverDnDormqr fatal error. " + << "Argument: " << -error << " is invalid."; + } + // Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step + // replaced rhs by (Q^T * rhs), this is just x = R \ rhs. + if (cublasDtrsv(cublas_handle_, + CUBLAS_FILL_MODE_UPPER, + CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, + num_cols_, + lhs_.data(), + num_rows_, + rhs_.data(), + 1) != CUBLAS_STATUS_SUCCESS) { + *message = "cuBLAS::cublasDtrsv failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + if (cudaDeviceSynchronize() != cudaSuccess || + cudaStreamSynchronize(stream_) != cudaSuccess) { + *message = "Cuda device synchronization failed."; + return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR; + } + rhs_.CopyToHost(solution, num_cols_); + *message = "Success"; + return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS; +} + +std::unique_ptr<CUDADenseQR> CUDADenseQR::Create( + const LinearSolver::Options& options) { + if (options.dense_linear_algebra_library_type != CUDA) { + // The user called the wrong factory method. + return nullptr; + } + auto cuda_dense_qr = + std::unique_ptr<CUDADenseQR>(new CUDADenseQR()); + std::string cuda_error; + if (cuda_dense_qr->Init(&cuda_error)) { + return cuda_dense_qr; + } + // Initialization failed, destroy the object (done automatically) and return a + // nullptr. + LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error; + return nullptr; +} + +CUDADenseQR::CUDADenseQR() = default; + +#endif // CERES_NO_CUDA + + } // namespace internal } // namespace ceres
diff --git a/internal/ceres/dense_qr.h b/internal/ceres/dense_qr.h index aafb067..8bcccf0 100644 --- a/internal/ceres/dense_qr.h +++ b/internal/ceres/dense_qr.h
@@ -45,6 +45,12 @@ #include "ceres/internal/export.h" #include "ceres/linear_solver.h" #include "glog/logging.h" +#ifndef CERES_NO_CUDA +#include "ceres/cuda_buffer.h" +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "cusolverDn.h" +#endif // CERES_NO_CUDA namespace ceres { namespace internal { @@ -137,6 +143,65 @@ }; #endif // CERES_NO_LAPACK +#ifndef CERES_NO_CUDA +// Implementation of DenseQR using the 32-bit cuSolverDn interface. A +// requirement for using this solver is that the lhs must not be rank deficient. +// This is because cuSolverDn does not implement the singularity-checking +// wrapper trtrs, hence this solver directly uses trsv from CUBLAS for the +// backsubstitution. +class CERES_NO_EXPORT CUDADenseQR : public DenseQR { + public: + static std::unique_ptr<CUDADenseQR> Create( + const LinearSolver::Options& options); + ~CUDADenseQR() override; + CUDADenseQR(const CUDADenseQR&) = delete; + CUDADenseQR& operator=(const CUDADenseQR&) = delete; + LinearSolverTerminationType Factorize(int num_rows, + int num_cols, + double* lhs, + std::string* message) override; + LinearSolverTerminationType Solve(const double* rhs, + double* solution, + std::string* message) override; + + private: + CUDADenseQR(); + // Initializes the cuSolverDN context, creates an asynchronous stream, and + // associates the stream with cuSolverDN. Returns true iff initialization was + // successful, else it returns false and a human-readable error message is + // returned. + bool Init(std::string* message); + + // Handle to the cuSOLVER context. + cusolverDnHandle_t cusolver_handle_ = nullptr; + // Handle to cuBLAS context. + cublasHandle_t cublas_handle_ = nullptr; + // CUDA device stream. + cudaStream_t stream_ = nullptr; + // Number of rowns in the A matrix, to be cached between calls to *Factorize + // and *Solve. + size_t num_rows_ = 0; + // Number of columns in the A matrix, to be cached between calls to *Factorize + // and *Solve. + size_t num_cols_ = 0; + // GPU memory allocated for the A matrix (lhs matrix). + CudaBuffer<double> lhs_; + // GPU memory allocated for the B matrix (rhs vector). + CudaBuffer<double> rhs_; + // GPU memory allocated for the TAU matrix (scaling of householder vectors). + CudaBuffer<double> tau_; + // Scratch space for cuSOLVER on the GPU. + CudaBuffer<uint8_t> device_workspace_; + // Required for error handling with cuSOLVER. + CudaBuffer<int> error_; + // Cache the result of Factorize to ensure that when Solve is called, the + // factiorization of lhs is valid. + LinearSolverTerminationType factorize_result_ = + LINEAR_SOLVER_FATAL_ERROR; +}; + +#endif // CERES_NO_CUDA + } // namespace internal } // namespace ceres
diff --git a/internal/ceres/dense_qr_test.cc b/internal/ceres/dense_qr_test.cc index ac1258d..d9d307e 100644 --- a/internal/ceres/dense_qr_test.cc +++ b/internal/ceres/dense_qr_test.cc
@@ -33,6 +33,7 @@ #include <memory> #include <numeric> #include <string> +#include <tuple> #include <vector> #include "Eigen/Dense" @@ -67,6 +68,7 @@ LinearSolver::Options options; options.dense_linear_algebra_library_type = GetParam(); + const double kEpsilon = std::numeric_limits<double>::epsilon() * 1.5e4; std::unique_ptr<DenseQR> dense_qr = DenseQR::Create(options); const int kNumTrials = 10; @@ -83,7 +85,6 @@ Vector x = VectorType::Random(num_cols); Vector rhs = lhs * x; Vector actual = Vector::Random(num_cols); - LinearSolver::Summary summary; summary.termination_type = dense_qr->FactorAndSolve(num_rows, num_cols, @@ -91,10 +92,8 @@ rhs.data(), actual.data(), &summary.message); - EXPECT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS); - EXPECT_NEAR((x - actual).norm() / x.norm(), - 0.0, - std::numeric_limits<double>::epsilon() * 400) + ASSERT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS); + ASSERT_NEAR((x - actual).norm() / x.norm(), 0.0, kEpsilon) << "\nexpected: " << x.transpose() << "\nactual : " << actual.transpose(); } @@ -102,17 +101,26 @@ } } +namespace { + +// NOTE: preprocessor directives in a macro are not standard conforming +decltype(auto) MakeValues() { + return ::testing::Values( + EIGEN #ifndef CERES_NO_LAPACK -INSTANTIATE_TEST_SUITE_P(_, - DenseQRTest, - ::testing::Values(EIGEN, LAPACK), - ParamInfoToString); -#else -INSTANTIATE_TEST_SUITE_P(_, - DenseQRTest, - ::testing::Values(EIGEN), - ParamInfoToString); + , + LAPACK #endif +#ifndef CERES_NO_CUDA + , + CUDA +#endif + ); +} + +} // namespace + +INSTANTIATE_TEST_SUITE_P(_, DenseQRTest, MakeValues(), ParamInfoToString); } // namespace internal } // namespace ceres