Add support for dense CUDA solvers #2
1. Add CUDADenseQR & tests.
CUDADenseQR uses the cuSolverDN LAPACK implementation
of QR factorization. A key limitation, however, is that
this solver does not perform singularity checking --
this is because cuSolverDN does not have a trtrs
implementation; we instead use cuBLAS' trsv for
backsubstitution.
2. All CPU -> GPU memory transfers are now async, and both
CUDADenseQR and CUDADenseCholesky explicitly manage their
own streams for async operations.
3. Simplified CUDADenseCholesky to only use the legacy 32-bit
cuSolverDN API.
Change-Id: I2a9b7b65469658ddfe33b5b2a3892c8744d6e437
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 48ba201..69432f7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -270,14 +270,6 @@
"${CUDA_LIBRARIES};"
"${CUDA_cusolver_LIBRARY};"
"${CUDA_cusparse_LIBRARY}")
- if (CUDA_VERSION VERSION_LESS 11.1)
- message("-- CUDA version ${CUDA_VERSION} is older than 11.1, "
- "using 32-bit API.")
- add_definitions(-DCERES_CUDA_NO_64BIT_SOLVER_API)
- else()
- message("-- CUDA version ${CUDA_VERSION} is at least 11.1, "
- "using 64-bit API.")
- endif()
else (CUDA_FOUND)
message("-- Did not find CUDA library, disabling CUDA support.")
update_cache_variable(CUDA OFF)
diff --git a/examples/nist.cc b/examples/nist.cc
index a3430ca..cb0d1aa0 100644
--- a/examples/nist.cc
+++ b/examples/nist.cc
@@ -101,7 +101,7 @@
"and cgnr");
DEFINE_string(dense_linear_algebra_library,
"eigen",
- "Options are: eigen and lapack.");
+ "Options are: eigen, lapack, and cuda.");
DEFINE_string(preconditioner, "jacobi", "Options are: identity, jacobi");
DEFINE_string(line_search,
"wolfe",
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 166eb19..6c6c3c1 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -143,6 +143,7 @@
list(APPEND
CERES_LIBRARY_PRIVATE_DEPENDENCIES
${CUDA_LIBRARIES}
+ ${CUDA_cublas_LIBRARY}
${CUDA_cusolver_LIBRARY}
${CUDA_cusparse_LIBRARY})
endif (CUDA_FOUND)
@@ -479,6 +480,7 @@
ceres_test(covariance)
ceres_test(cubic_interpolation)
ceres_test(cuda_dense_cholesky)
+ ceres_test(cuda_dense_qr)
ceres_test(dense_linear_solver)
ceres_test(dense_cholesky)
ceres_test(dense_qr)
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index 44e7de3..61f9752 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -45,7 +45,8 @@
// the appropriate GPU device is selected before each subroutine is called. This
// is particularly important when using multiple GPU devices on different CPU
// threads, since active Cuda devices are determined by the cuda runtime on a
-// per-thread basis.
+// per-thread basis. Note that unless otherwise specified, all methods use the
+// default stream, and are synchronous.
template <typename T>
class CudaBuffer {
public:
@@ -59,6 +60,8 @@
}
}
+ // Grow the GPU memory buffer if needed to accommodate data of the specified
+ // size
void Reserve(const size_t size) {
if (size > size_) {
if (data_ != nullptr) {
@@ -69,12 +72,21 @@
}
}
- void CopyToGpu(const T* data, const size_t size) {
+ // Perform an asynchronous copy from CPU memory to GPU memory using the stream
+ // provided.
+ void CopyToGpuAsync(const T* data, const size_t size, cudaStream_t stream) {
Reserve(size);
- CHECK_EQ(cudaMemcpy(data_, data, size * sizeof(T), cudaMemcpyHostToDevice),
+ CHECK_EQ(cudaMemcpyAsync(data_,
+ data,
+ size * sizeof(T),
+ cudaMemcpyHostToDevice,
+ stream),
cudaSuccess);
}
+ // Copy data from the GPU to CPU memory. This is necessarily synchronous since
+ // any potential GPU kernels that may be writing to the buffer must finish
+ // before the transfer happens.
void CopyToHost(T* data, const size_t size) {
CHECK(data_ != nullptr);
CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost),
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
index 7d39b0e..6c4dcc3 100644
--- a/internal/ceres/cuda_dense_cholesky_test.cc
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -36,21 +36,19 @@
#include "glog/logging.h"
#include "gtest/gtest.h"
-using std::string;
-
namespace ceres {
namespace internal {
#ifndef CERES_NO_CUDA
-TEST(DenseCholeskyCudaSolverOld, InvalidOptionOnCreate) {
+TEST(CUDADenseCholesky, InvalidOptionOnCreate) {
LinearSolver::Options options;
- auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+ auto dense_cuda_solver = CUDADenseCholesky::Create(options);
EXPECT_EQ(dense_cuda_solver, nullptr);
}
// Tests the CUDA Cholesky solver with a simple 4x4 matrix.
-TEST(DenseCholeskyCudaSolverOld, Cholesky4x4Matrix) {
+TEST(CUDADenseCholesky, Cholesky4x4Matrix) {
Eigen::Matrix4d A;
A << 4, 12, -16, 0,
12, 37, -43, 0,
@@ -59,9 +57,9 @@
const Eigen::Vector4d b = Eigen::Vector4d::Ones();
LinearSolver::Options options;
options.dense_linear_algebra_library_type = CUDA;
- auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+ auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
- string error_string;
+ std::string error_string;
ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
A.data(),
&error_string),
@@ -75,7 +73,7 @@
EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10);
}
-TEST(DenseCholeskyCudaSolverOld, SingularMatrix) {
+TEST(CUDADenseCholesky, SingularMatrix) {
Eigen::Matrix3d A;
A << 1, 0, 0,
0, 1, 0,
@@ -83,16 +81,16 @@
const Eigen::Vector3d b = Eigen::Vector3d::Ones();
LinearSolver::Options options;
options.dense_linear_algebra_library_type = CUDA;
- auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+ auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
- string error_string;
+ std::string error_string;
ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
A.data(),
&error_string),
LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
}
-TEST(DenseCholeskyCudaSolverOld, NegativeMatrix) {
+TEST(CUDADenseCholesky, NegativeMatrix) {
Eigen::Matrix3d A;
A << 1, 0, 0,
0, 1, 0,
@@ -100,107 +98,26 @@
const Eigen::Vector3d b = Eigen::Vector3d::Ones();
LinearSolver::Options options;
options.dense_linear_algebra_library_type = CUDA;
- auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+ auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
- string error_string;
+ std::string error_string;
ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
A.data(),
&error_string),
LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
}
-TEST(DenseCholeskyCudaSolverOld, MustFactorizeBeforeSolve) {
+TEST(CUDADenseCholesky, MustFactorizeBeforeSolve) {
const Eigen::Vector3d b = Eigen::Vector3d::Ones();
LinearSolver::Options options;
options.dense_linear_algebra_library_type = CUDA;
- auto dense_cuda_solver = CUDADenseCholesky32Bit::Create(options);
+ auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
- string error_string;
+ std::string error_string;
ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
}
-#ifndef CERES_CUDA_NO_64BIT_SOLVER_API
-// TODO(sameeragarwal): Convert these tests into templated tests so that we use
-// the exact same code for both versions of the test.
-
-TEST(DenseCholeskyCudaSolverNew, InvalidOptionOnCreate) {
- LinearSolver::Options options;
- auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
- EXPECT_EQ(dense_cuda_solver, nullptr);
-}
-
-TEST(DenseCholeskyCudaSolverNew, Cholesky4x4Matrix) {
- Eigen::Matrix4d A;
- A << 4, 12, -16, 0,
- 12, 37, -43, 0,
- -16, -43, 98, 0,
- 0, 0, 0, 1;
- const Eigen::Vector4d b = Eigen::Vector4d::Ones();
- LinearSolver::Options options;
- options.dense_linear_algebra_library_type = CUDA;
- auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
- ASSERT_NE(dense_cuda_solver, nullptr);
- string error_string;
- ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
- A.data(),
- &error_string),
- LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
- Eigen::Vector4d x = Eigen::Vector4d::Zero();
- ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
- LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
- EXPECT_NEAR(x(0), 113.75 / 3.0, std::numeric_limits<double>::epsilon() * 10);
- EXPECT_NEAR(x(1), -31.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
- EXPECT_NEAR(x(2), 5.0 / 3.0, std::numeric_limits<double>::epsilon() * 10);
- EXPECT_NEAR(x(3), 1.0000, std::numeric_limits<double>::epsilon() * 10);
-}
-
-TEST(DenseCholeskyCudaSolverNew, SingularMatrix) {
- Eigen::Matrix3d A;
- A << 1, 0, 0,
- 0, 1, 0,
- 0, 0, 0;
- const Eigen::Vector3d b = Eigen::Vector3d::Ones();
- LinearSolver::Options options;
- options.dense_linear_algebra_library_type = CUDA;
- auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
- ASSERT_NE(dense_cuda_solver, nullptr);
- string error_string;
- ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
- A.data(),
- &error_string),
- LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
-}
-
-TEST(DenseCholeskyCudaSolverNew, NegativeMatrix) {
- Eigen::Matrix3d A;
- A << 1, 0, 0,
- 0, 1, 0,
- 0, 0, -1;
- const Eigen::Vector3d b = Eigen::Vector3d::Ones();
- LinearSolver::Options options;
- options.dense_linear_algebra_library_type = CUDA;
- auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
- ASSERT_NE(dense_cuda_solver, nullptr);
- string error_string;
- ASSERT_EQ(dense_cuda_solver->Factorize(A.cols(),
- A.data(),
- &error_string),
- LinearSolverTerminationType::LINEAR_SOLVER_FAILURE);
-}
-
-TEST(DenseCholeskyCudaSolverNew, MustFactorizeBeforeSolve) {
- const Eigen::Vector3d b = Eigen::Vector3d::Ones();
- LinearSolver::Options options;
- options.dense_linear_algebra_library_type = CUDA;
- auto dense_cuda_solver = CUDADenseCholesky64Bit::Create(options);
- ASSERT_NE(dense_cuda_solver, nullptr);
- string error_string;
- ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
- LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
-}
-#endif // CERES_CUDA_NO_64BIT_SOLVER_API
-
#endif // CERES_NO_CUDA
} // namespace internal
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc
new file mode 100644
index 0000000..15ba00a
--- /dev/null
+++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -0,0 +1,164 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <string>
+
+#include "ceres/dense_qr.h"
+#include "ceres/internal/eigen.h"
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace ceres {
+namespace internal {
+
+#ifndef CERES_NO_CUDA
+
+TEST(CUDADenseQR, InvalidOptionOnCreate) {
+ LinearSolver::Options options;
+ auto dense_cuda_solver = CUDADenseQR::Create(options);
+ EXPECT_EQ(dense_cuda_solver, nullptr);
+}
+
+// Tests the CUDA QR solver with a simple 4x4 matrix.
+TEST(CUDADenseQR, QR4x4Matrix) {
+ Eigen::Matrix4d A;
+ A << 4, 12, -16, 0,
+ 12, 37, -43, 0,
+ -16, -43, 98, 0,
+ 0, 0, 0, 1;
+ const Eigen::Vector4d b = Eigen::Vector4d::Ones();
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseQR::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ std::string error_string;
+ ASSERT_EQ(dense_cuda_solver->Factorize(A.rows(),
+ A.cols(),
+ A.data(),
+ &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+ Eigen::Vector4d x = Eigen::Vector4d::Zero();
+ ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+ // Empirically observed accuracy of cuSolverDN's QR solver.
+ const double kEpsilon = 1e-11;
+ EXPECT_NEAR(x(0), 113.75 / 3.0, kEpsilon);
+ EXPECT_NEAR(x(1), -31.0 / 3.0, kEpsilon);
+ EXPECT_NEAR(x(2), 5.0 / 3.0, kEpsilon);
+ EXPECT_NEAR(x(3), 1.0000, kEpsilon);
+}
+
+// Tests the CUDA QR solver with a simple 4x4 matrix.
+TEST(CUDADenseQR, QR4x2Matrix) {
+ Eigen::Matrix<double, 4, 2> A;
+ A << 4, 12,
+ 12, 37,
+ -16, -43,
+ 0, 0;
+ const std::vector<double> b(4, 1.0);
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseQR::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ std::string error_string;
+ ASSERT_EQ(dense_cuda_solver->Factorize(A.rows(),
+ A.cols(),
+ A.data(),
+ &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+ std::vector<double> x(2, 0);
+ ASSERT_EQ(dense_cuda_solver->Solve(b.data(), x.data(), &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS);
+ // Empirically observed accuracy of cuSolverDN's QR solver.
+ const double kEpsilon = 1e-11;
+ // Solution values computed with Octave.
+ EXPECT_NEAR(x[0], -1.143410852713177, kEpsilon);
+ EXPECT_NEAR(x[1], 0.4031007751937981, kEpsilon);
+}
+
+TEST(CUDADenseQR, MustFactorizeBeforeSolve) {
+ const Eigen::Vector3d b = Eigen::Vector3d::Ones();
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ auto dense_cuda_solver = CUDADenseQR::Create(options);
+ ASSERT_NE(dense_cuda_solver, nullptr);
+ std::string error_string;
+ ASSERT_EQ(dense_cuda_solver->Solve(b.data(), nullptr, &error_string),
+ LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
+}
+
+TEST(CUDADenseQR, Randomized1600x100Tests) {
+ const int kNumRows = 1600;
+ const int kNumCols = 100;
+ using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
+ using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+ using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = ceres::CUDA;
+ std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options);
+
+ const int kNumTrials = 100;
+ const int kMinNumCols = 1;
+ const int kMaxNumCols = 10;
+ const int kMinRowsFactor = 1;
+ const int kMaxRowsFactor = 3;
+ for (int i = 0; i < kNumTrials; ++i) {
+ LhsType lhs = LhsType::Random(kNumRows, kNumCols);
+ SolutionType x_expected = SolutionType::Random(kNumCols);
+ RhsType rhs = lhs * x_expected;
+ SolutionType x_computed = SolutionType::Zero(kNumCols);
+ // Sanity check the random matrix sizes.
+ EXPECT_EQ(lhs.rows(), kNumRows);
+ EXPECT_EQ(lhs.cols(), kNumCols);
+ EXPECT_EQ(rhs.rows(), kNumRows);
+ EXPECT_EQ(rhs.cols(), 1);
+ EXPECT_EQ(x_expected.rows(), kNumCols);
+ EXPECT_EQ(x_expected.cols(), 1);
+ EXPECT_EQ(x_computed.rows(), kNumCols);
+ EXPECT_EQ(x_computed.cols(), 1);
+ LinearSolver::Summary summary;
+ summary.termination_type = dense_qr->FactorAndSolve(kNumRows,
+ kNumCols,
+ lhs.data(),
+ rhs.data(),
+ x_computed.data(),
+ &summary.message);
+ ASSERT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS);
+ ASSERT_NEAR((x_computed - x_expected).norm() / x_expected.norm(),
+ 0.0,
+ std::numeric_limits<double>::epsilon() * 400);
+ }
+}
+#endif // CERES_NO_CUDA
+
+} // namespace internal
+} // namespace ceres
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index af555e7..2d8c2da 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -193,13 +193,14 @@
#ifndef CERES_NO_CUDA
-bool CUDADenseCholesky32Bit::Init(std::string* message) {
+bool CUDADenseCholesky::Init(std::string* message) {
if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnCreate failed.";
return false;
}
- if (cudaStreamCreate(&stream_) != cudaSuccess) {
- *message = "cuSolverDN::cudaStreamCreate failed.";
+ if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) !=
+ cudaSuccess) {
+ *message = "CUDA::cudaStreamCreateWithFlags failed.";
cusolverDnDestroy(cusolver_handle_);
return false;
}
@@ -211,26 +212,23 @@
return false;
}
error_.Reserve(1);
- *message = "CUDADenseCholesky64Bit::Init Success.";
+ *message = "CUDADenseCholesky::Init Success.";
return true;
}
-CUDADenseCholesky32Bit::~CUDADenseCholesky32Bit() {
+CUDADenseCholesky::~CUDADenseCholesky() {
if (cusolver_handle_ != nullptr) {
CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS);
CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess);
}
}
-LinearSolverTerminationType CUDADenseCholesky32Bit::Factorize(
+LinearSolverTerminationType CUDADenseCholesky::Factorize(
int num_cols, double* lhs, std::string* message) {
factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
- // Allocate GPU memory if necessary.
lhs_.Reserve(num_cols * num_cols);
num_cols_ = num_cols;
- // Copy A to GPU.
- lhs_.CopyToGpu(lhs, num_cols * num_cols);
- // Allocate scratch space on GPU.
+ lhs_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_);
int device_workspace_size = 0;
if (cusolverDnDpotrf_bufferSize(cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
@@ -242,9 +240,7 @@
*message = "cuSolverDN::cusolverDnDpotrf_bufferSize failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
}
- // Allocate GPU scratch memory.
device_workspace_.Reserve(device_workspace_size);
- // Run the actual factorization (potrf)
if (cusolverDnDpotrf(cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols,
@@ -256,13 +252,12 @@
*message = "cuSolverDN::cusolverDnDpotrf failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
}
- if (cudaDeviceSynchronize() != cudaSuccess || cudaStreamSynchronize(stream_),
- cudaSuccess) {
+ if (cudaDeviceSynchronize() != cudaSuccess ||
+ cudaStreamSynchronize(stream_) != cudaSuccess) {
*message = "Cuda device synchronization failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
}
int error = 0;
- // Check for errors.
error_.CopyToHost(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
@@ -285,15 +280,13 @@
return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
}
-LinearSolverTerminationType CUDADenseCholesky32Bit::Solve(
+LinearSolverTerminationType CUDADenseCholesky::Solve(
const double* rhs, double* solution, std::string* message) {
if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
*message = "Factorize did not complete succesfully previously.";
return factorize_result_;
}
- // Copy RHS to GPU.
- rhs_.CopyToGpu(rhs, num_cols_);
- // Solve the system.
+ rhs_.CopyToGpuAsync(rhs, num_cols_, stream_);
if (cusolverDnDpotrs(cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
@@ -306,237 +299,41 @@
*message = "cuSolverDN::cusolverDnDpotrs failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
}
- if (cudaDeviceSynchronize() != cudaSuccess || cudaStreamSynchronize(stream_),
- cudaSuccess) {
+ if (cudaDeviceSynchronize() != cudaSuccess ||
+ cudaStreamSynchronize(stream_) != cudaSuccess) {
*message = "Cuda device synchronization failed.";
return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
}
- // Check for errors.
int error = 0;
- // Copy X from GPU to host.
- rhs_.CopyToHost(solution, num_cols_);
+ error_.CopyToHost(&error, 1);
if (error != 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it."
<< "cuSolverDN::cusolverDnDpotrs fatal error. "
<< "Argument: " << -error << " is invalid.";
}
- // Copy error variable from GPU to host.
- error_.CopyToHost(&error, 1);
- *message = "Success";
- return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
-}
-
-std::unique_ptr<CUDADenseCholesky32Bit> CUDADenseCholesky32Bit::Create(
- const LinearSolver::Options& options) {
- if (options.dense_linear_algebra_library_type != CUDA) {
- // The user called the wrong factory method.
- return nullptr;
- }
- CUDADenseCholesky32Bit* cuda_dense_cholesky = new CUDADenseCholesky32Bit();
- std::string cuda_error;
- if (cuda_dense_cholesky->Init(&cuda_error)) {
- return std::unique_ptr<CUDADenseCholesky32Bit>(cuda_dense_cholesky);
- }
- // Initialization failed, destroy the object and return a nullptr.
- delete cuda_dense_cholesky;
- LOG(ERROR) << "CUDADenseCholesky32Bit::Init failed: " << cuda_error;
- return std::unique_ptr<CUDADenseCholesky32Bit>(nullptr);
-}
-
-std::unique_ptr<CUDADenseCholesky64Bit> CUDADenseCholesky64Bit::Create(
- const LinearSolver::Options& options) {
- if (options.dense_linear_algebra_library_type != CUDA) {
- // The user called the wrong factory method.
- return nullptr;
- }
- CUDADenseCholesky64Bit* cuda_dense_cholesky = new CUDADenseCholesky64Bit();
- std::string cuda_error;
- if (cuda_dense_cholesky->Init(&cuda_error)) {
- return std::unique_ptr<CUDADenseCholesky64Bit>(cuda_dense_cholesky);
- }
- // Initialization failed, destroy the object and return a nullptr.
- delete cuda_dense_cholesky;
- LOG(ERROR) << "CUDADenseCholesky64Bit::Init failed: " << cuda_error;
- return std::unique_ptr<CUDADenseCholesky64Bit>(nullptr);
-}
-
-#ifdef CERES_CUDA_NO_64BIT_SOLVER_API
-// CUDA < 11.1 did not have the 64-bit APIs, so the implementation of the new
-// interface will just generate a fatal error.
-
-CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() {}
-
-bool CUDADenseCholesky64Bit::Init(std::string* message) {
- *message = "Cannot use CUDADenseCholesky64Bit with CUDA < 11.1.";
- return false;
-}
-
-LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize(int,
- double*,
- std::string*) {
- return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-}
-
-LinearSolverTerminationType CUDADenseCholesky64Bit::Solve(const double*,
- double*,
- std::string*) {
- return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
-}
-
-#else // CERES_CUDA_NO_64BIT_SOLVER_API
-
-bool CUDADenseCholesky64Bit::Init(std::string* message) {
- if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
- *message = "cuSolverDN::cusolverDnCreate failed.";
- return false;
- }
- if (cudaStreamCreate(&stream_) != cudaSuccess) {
- *message = "cuSolverDN::cudaStreamCreate failed.";
- cusolverDnDestroy(cusolver_handle_);
- return false;
- }
- if (cusolverDnSetStream(cusolver_handle_, stream_) !=
- CUSOLVER_STATUS_SUCCESS) {
- *message = "cuSolverDN::cusolverDnSetStream failed.";
- cudaStreamDestroy(stream_);
- cusolverDnDestroy(cusolver_handle_);
- return false;
- }
- error_.Reserve(1);
- *message = "CUDADenseCholesky64Bit::Init Success.";
- return true;
-}
-
-CUDADenseCholesky64Bit::~CUDADenseCholesky64Bit() {
- if (cusolver_handle_ != nullptr) {
- CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS);
- CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess);
- }
-}
-
-LinearSolverTerminationType CUDADenseCholesky64Bit::Factorize(
- int num_cols, double* lhs, std::string* message) {
- factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
- // Allocate GPU memory if necessary.
- lhs_.Reserve(num_cols * num_cols);
- num_cols_ = num_cols;
- // Copy A to GPU.
- lhs_.CopyToGpu(lhs, num_cols * num_cols);
- // Allocate scratch space on GPU.
- size_t host_workspace_size = 0;
- size_t device_workspace_size = 0;
- if (cusolverDnXpotrf_bufferSize(cusolver_handle_,
- nullptr,
- CUBLAS_FILL_MODE_LOWER,
- num_cols,
- CUDA_R_64F,
- lhs_.data(),
- num_cols,
- CUDA_R_64F,
- &device_workspace_size,
- &host_workspace_size) !=
- CUSOLVER_STATUS_SUCCESS) {
- *message = "cuSolverDN::cusolverDnXpotrf_bufferSize failed.";
- return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
- }
- // Allocate host scratch memory.
- host_workspace_.resize(host_workspace_size);
- // Allocate GPU scratch memory.
- device_workspace_.Reserve(device_workspace_size);
- // Run the actual factorization (potrf)
- if (cusolverDnXpotrf(cusolver_handle_,
- nullptr,
- CUBLAS_FILL_MODE_LOWER,
- num_cols,
- CUDA_R_64F,
- lhs_.data(),
- num_cols,
- CUDA_R_64F,
- device_workspace_.data(),
- device_workspace_.size(),
- host_workspace_.data(),
- host_workspace_.size(),
- error_.data()) != CUSOLVER_STATUS_SUCCESS) {
- *message = "cuSolverDN::cusolverDnXpotrf failed.";
- return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
- }
- if (cudaDeviceSynchronize() != cudaSuccess ||
- cudaStreamSynchronize(stream_) != cudaSuccess) {
- *message = "Cuda device synchronization failed.";
- return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
- }
- int error = 0;
- // Check for errors.
- error_.CopyToHost(&error, 1);
- if (error < 0) {
- LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
- << "please report it. "
- << "cuSolverDN::cusolverDnXpotrf fatal error. "
- << "Argument: " << -error << " is invalid.";
- // The following line is unreachable, but return failure just to be
- // pedantic, since the compiler does not know that.
- return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
- } else if (error > 0) {
- *message = StringPrintf(
- "cuSolverDN::cusolverDnXpotrf numerical failure. "
- "The leading minor of order %d is not positive definite.",
- error);
- factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
- return LinearSolverTerminationType::LINEAR_SOLVER_FAILURE;
- }
-
- *message = "Success";
- factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
- return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
-}
-
-LinearSolverTerminationType CUDADenseCholesky64Bit::Solve(
- const double* rhs, double* solution, std::string* message) {
- if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
- *message = "Factorize did not complete succesfully previously.";
- return factorize_result_;
- }
- // Copy RHS to GPU.
- rhs_.CopyToGpu(rhs, num_cols_);
- // Solve the system.
- if (cusolverDnXpotrs(cusolver_handle_,
- nullptr,
- CUBLAS_FILL_MODE_LOWER,
- num_cols_,
- 1,
- CUDA_R_64F,
- lhs_.data(),
- num_cols_,
- CUDA_R_64F,
- rhs_.data(),
- num_cols_,
- error_.data()) != CUSOLVER_STATUS_SUCCESS) {
- *message = "cuSolverDN::cusolverDnXpotrs failed.";
- return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
- }
- if (cudaDeviceSynchronize() != cudaSuccess ||
- cudaStreamSynchronize(stream_) != cudaSuccess) {
- *message = "Cuda device synchronization failed.";
- return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
- }
- // Check for errors.
- int error = 0;
- // Copy error variable from GPU to host.
- error_.CopyToHost(&error, 1);
- if (error != 0) {
- LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
- << "Please report it. "
- << "cuSolverDN::cusolverDnXpotrs fatal error. "
- << "Argument: " << -error << " is invalid.";
- }
- // Copy X from GPU to host.
rhs_.CopyToHost(solution, num_cols_);
*message = "Success";
return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
}
-#endif // CERES_CUDA_NO_64BIT_SOLVER_API
+std::unique_ptr<CUDADenseCholesky> CUDADenseCholesky::Create(
+ const LinearSolver::Options& options) {
+ if (options.dense_linear_algebra_library_type != CUDA) {
+ // The user called the wrong factory method.
+ return nullptr;
+ }
+ auto cuda_dense_cholesky =
+ std::unique_ptr<CUDADenseCholesky>(new CUDADenseCholesky());
+ std::string cuda_error;
+ if (cuda_dense_cholesky->Init(&cuda_error)) {
+ return cuda_dense_cholesky;
+ }
+ // Initialization failed, destroy the object (done automatically) and return a
+ // nullptr.
+ LOG(ERROR) << "CUDADenseCholesky::Init failed: " << cuda_error;
+ return nullptr;
+}
#endif // CERES_NO_CUDA
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index f7e48d4..49d780c 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -134,15 +134,15 @@
#endif // CERES_NO_LAPACK
#ifndef CERES_NO_CUDA
-// Implementation of DenseCholesky using the cuSolver library v.11.0 or older,
-// using the legacy cuSolverDn interface.
-class CERES_NO_EXPORT CUDADenseCholesky32Bit : public DenseCholesky {
+// CUDA implementation of DenseCholesky using the cuSolverDN library using the
+// 32-bit legacy interface for maximum compatibility.
+class CERES_NO_EXPORT CUDADenseCholesky : public DenseCholesky {
public:
- static std::unique_ptr<CUDADenseCholesky32Bit> Create(
+ static std::unique_ptr<CUDADenseCholesky> Create(
const LinearSolver::Options& options);
- ~CUDADenseCholesky32Bit() override;
- CUDADenseCholesky32Bit(const CUDADenseCholesky32Bit&) = delete;
- CUDADenseCholesky32Bit& operator=(const CUDADenseCholesky32Bit&) = delete;
+ ~CUDADenseCholesky() override;
+ CUDADenseCholesky(const CUDADenseCholesky&) = delete;
+ CUDADenseCholesky& operator=(const CUDADenseCholesky&) = delete;
LinearSolverTerminationType Factorize(int num_cols,
double* lhs,
std::string* message) override;
@@ -151,7 +151,7 @@
std::string* message) override;
private:
- CUDADenseCholesky32Bit() = default;
+ CUDADenseCholesky() = default;
// Initializes the cuSolverDN context, creates an asynchronous stream, and
// associates the stream with cuSolverDN. Returns true iff initialization was
// successful, else it returns false and a human-readable error message is
@@ -179,59 +179,6 @@
LINEAR_SOLVER_FATAL_ERROR;
};
-// Implementation of DenseCholesky using the cuSolver library v.11.1 or newer,
-// using the 64-bit cuSolverDn interface.
-class CERES_NO_EXPORT CUDADenseCholesky64Bit : public DenseCholesky {
- public:
- static std::unique_ptr<CUDADenseCholesky64Bit> Create(
- const LinearSolver::Options& options);
- ~CUDADenseCholesky64Bit() override;
- CUDADenseCholesky64Bit(const CUDADenseCholesky64Bit&) = delete;
- CUDADenseCholesky64Bit& operator=(const CUDADenseCholesky64Bit&) = delete;
- LinearSolverTerminationType Factorize(int num_cols,
- double* lhs,
- std::string* message) override;
- LinearSolverTerminationType Solve(const double* rhs,
- double* solution,
- std::string* message) override;
-
- private:
- CUDADenseCholesky64Bit() = default;
- // Initializes the cuSolverDN context, creates an asynchronous stream, and
- // associates the stream with cuSolverDN. Returns true iff initialization was
- // successful, else it returns false and a human-readable error message is
- // returned.
- bool Init(std::string* message);
-
- // Handle to the cuSOLVER context.
- cusolverDnHandle_t cusolver_handle_ = nullptr;
- // CUDA device stream.
- cudaStream_t stream_ = nullptr;
- // Number of columns in the A matrix, to be cached between calls to Factorize
- // and Solve.
- size_t num_cols_ = 0;
- // GPU memory allocated for the A matrix (lhs matrix).
- CudaBuffer<double> lhs_;
- // GPU memory allocated for the B matrix (rhs vector).
- CudaBuffer<double> rhs_;
- // Workspace for cuSOLVER on the GPU.
- CudaBuffer<uint8_t> device_workspace_;
- // Workspace for cuSOLVER on the host.
- std::vector<double> host_workspace_;
- // Required for error handling with cuSOLVER.
- CudaBuffer<int> error_;
- // Cache the result of Factorize to ensure that when Solve is called, the
- // factiorization of lhs is valid.
- LinearSolverTerminationType factorize_result_ =
- LINEAR_SOLVER_FATAL_ERROR;
-};
-
-#ifdef CERES_CUDA_NO_64BIT_SOLVER_API
-using CUDADenseCholesky = CUDADenseCholesky32Bit;
-#else
-using CUDADenseCholesky = CUDADenseCholesky64Bit;
-#endif
-
#endif // CERES_NO_CUDA
} // namespace internal
diff --git a/internal/ceres/dense_linear_solver_benchmark.cc b/internal/ceres/dense_linear_solver_benchmark.cc
index 2af9fa8..ba171bf 100644
--- a/internal/ceres/dense_linear_solver_benchmark.cc
+++ b/internal/ceres/dense_linear_solver_benchmark.cc
@@ -98,7 +98,9 @@
#endif // CERES_NO_LAPACK
#ifndef CERES_NO_CUDA
-BENCHMARK(BM_DenseSolver<ceres::CUDA, ceres::DENSE_NORMAL_CHOLESKY>)
+BENCHMARK_TEMPLATE2(BM_DenseSolver, ceres::CUDA, ceres::DENSE_NORMAL_CHOLESKY)
+ ->Apply(MatrixSizes);
+BENCHMARK_TEMPLATE2(BM_DenseSolver, ceres::CUDA, ceres::DENSE_QR)
->Apply(MatrixSizes);
#endif // CERES_NO_CUDA
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc
index 33a7cbd..77f04ad 100644
--- a/internal/ceres/dense_qr.cc
+++ b/internal/ceres/dense_qr.cc
@@ -33,6 +33,10 @@
#include <algorithm>
#include <memory>
#include <string>
+#ifndef CERES_NO_CUDA
+#include "cusolverDn.h"
+#include "cublas_v2.h"
+#endif // CERES_NO_CUDA
#ifndef CERES_NO_LAPACK
@@ -124,6 +128,14 @@
LOG(FATAL) << "Ceres was compiled without support for LAPACK.";
#endif
+ case CUDA:
+#ifndef CERES_NO_CUDA
+ dense_qr = CUDADenseQR::Create(options);
+ break;
+#else
+ LOG(FATAL) << "Ceres was compiled without support for CUDA.";
+#endif
+
default:
LOG(FATAL) << "Unknown dense linear algebra library type : "
<< DenseLinearAlgebraLibraryTypeToString(
@@ -296,5 +308,207 @@
#endif // CERES_NO_LAPACK
+#ifndef CERES_NO_CUDA
+
+bool CUDADenseQR::Init(std::string* message) {
+ if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
+ *message = "cuBLAS::cublasCreate failed.";
+ return false;
+ }
+ if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnCreate failed.";
+ return false;
+ }
+ if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) !=
+ cudaSuccess) {
+ *message = "CUDA::cudaStreamCreateWithFlags failed.";
+ cusolverDnDestroy(cusolver_handle_);
+ cublasDestroy(cublas_handle_);
+ return false;
+ }
+ if (cusolverDnSetStream(cusolver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnSetStream failed.";
+ cusolverDnDestroy(cusolver_handle_);
+ cudaStreamDestroy(stream_);
+ cublasDestroy(cublas_handle_);
+ return false;
+ }
+ if (cublasSetStream(cublas_handle_, stream_) != CUBLAS_STATUS_SUCCESS) {
+ *message = "cuBLAS::cublasSetStream failed.";
+ cusolverDnDestroy(cusolver_handle_);
+ cublasDestroy(cublas_handle_);
+ cudaStreamDestroy(stream_);
+ return false;
+ }
+ error_.Reserve(1);
+ *message = "CUDADenseQR::Init Success.";
+ return true;
+}
+
+CUDADenseQR::~CUDADenseQR() {
+ if (cublas_handle_ != nullptr) {
+ CHECK_EQ(cublasDestroy(cublas_handle_), CUBLAS_STATUS_SUCCESS);
+ }
+ if (cusolver_handle_ != nullptr) {
+ CHECK_EQ(cusolverDnDestroy(cusolver_handle_), CUSOLVER_STATUS_SUCCESS);
+ }
+ if (stream_ != nullptr) {
+ CHECK_EQ(cudaStreamDestroy(stream_), cudaSuccess);
+ }
+}
+
+LinearSolverTerminationType CUDADenseQR::Factorize(
+ int num_rows, int num_cols, double* lhs, std::string* message) {
+ factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ lhs_.Reserve(num_rows * num_cols);
+ tau_.Reserve(std::min(num_rows, num_cols));
+ num_rows_ = num_rows;
+ num_cols_ = num_cols;
+ lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_);
+ int device_workspace_size = 0;
+ if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
+ num_rows,
+ num_cols,
+ lhs_.data(),
+ num_rows,
+ &device_workspace_size) !=
+ CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnDgeqrf_bufferSize failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ device_workspace_.Reserve(device_workspace_size);
+ if (cusolverDnDgeqrf(cusolver_handle_,
+ num_rows,
+ num_cols,
+ lhs_.data(),
+ num_rows,
+ tau_.data(),
+ reinterpret_cast<double*>(device_workspace_.data()),
+ device_workspace_.size(),
+ error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnDgeqrf failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ if (cudaDeviceSynchronize() != cudaSuccess ||
+ cudaStreamSynchronize(stream_) != cudaSuccess) {
+ *message = "Cuda device synchronization failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ int error = 0;
+ error_.CopyToHost(&error, 1);
+ if (error < 0) {
+ LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
+ << "please report it. "
+ << "cuSolverDN::cusolverDnDgeqrf fatal error. "
+ << "Argument: " << -error << " is invalid.";
+ // The following line is unreachable, but return failure just to be
+ // pedantic, since the compiler does not know that.
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+
+ *message = "Success";
+ factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+ return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+LinearSolverTerminationType CUDADenseQR::Solve(
+ const double* rhs, double* solution, std::string* message) {
+ if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
+ *message = "Factorize did not complete succesfully previously.";
+ return factorize_result_;
+ }
+ rhs_.CopyToGpuAsync(rhs, num_rows_, stream_);
+ int device_workspace_size = 0;
+ if (cusolverDnDormqr_bufferSize(cusolver_handle_,
+ CUBLAS_SIDE_LEFT,
+ CUBLAS_OP_T,
+ num_rows_,
+ 1,
+ num_cols_,
+ lhs_.data(),
+ num_rows_,
+ tau_.data(),
+ rhs_.data(),
+ num_rows_,
+ &device_workspace_size) !=
+ CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnDormqr_bufferSize failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ device_workspace_.Reserve(device_workspace_size);
+ // Compute rhs = Q^T * rhs, assuming that lhs has already been factorized.
+ // The result of factorization would have stored Q in a packed form in lhs_.
+ if (cusolverDnDormqr(cusolver_handle_,
+ CUBLAS_SIDE_LEFT,
+ CUBLAS_OP_T,
+ num_rows_,
+ 1,
+ num_cols_,
+ lhs_.data(),
+ num_rows_,
+ tau_.data(),
+ rhs_.data(),
+ num_rows_,
+ reinterpret_cast<double*>(device_workspace_.data()),
+ device_workspace_.size(),
+ error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnDormqr failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ int error = 0;
+ error_.CopyToHost(&error, 1);
+ if (error < 0) {
+ LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
+ << "Please report it."
+ << "cuSolverDN::cusolverDnDormqr fatal error. "
+ << "Argument: " << -error << " is invalid.";
+ }
+ // Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step
+ // replaced rhs by (Q^T * rhs), this is just x = R \ rhs.
+ if (cublasDtrsv(cublas_handle_,
+ CUBLAS_FILL_MODE_UPPER,
+ CUBLAS_OP_N,
+ CUBLAS_DIAG_NON_UNIT,
+ num_cols_,
+ lhs_.data(),
+ num_rows_,
+ rhs_.data(),
+ 1) != CUBLAS_STATUS_SUCCESS) {
+ *message = "cuBLAS::cublasDtrsv failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ if (cudaDeviceSynchronize() != cudaSuccess ||
+ cudaStreamSynchronize(stream_) != cudaSuccess) {
+ *message = "Cuda device synchronization failed.";
+ return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
+ }
+ rhs_.CopyToHost(solution, num_cols_);
+ *message = "Success";
+ return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
+}
+
+std::unique_ptr<CUDADenseQR> CUDADenseQR::Create(
+ const LinearSolver::Options& options) {
+ if (options.dense_linear_algebra_library_type != CUDA) {
+ // The user called the wrong factory method.
+ return nullptr;
+ }
+ auto cuda_dense_qr =
+ std::unique_ptr<CUDADenseQR>(new CUDADenseQR());
+ std::string cuda_error;
+ if (cuda_dense_qr->Init(&cuda_error)) {
+ return cuda_dense_qr;
+ }
+ // Initialization failed, destroy the object (done automatically) and return a
+ // nullptr.
+ LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error;
+ return nullptr;
+}
+
+CUDADenseQR::CUDADenseQR() = default;
+
+#endif // CERES_NO_CUDA
+
+
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/dense_qr.h b/internal/ceres/dense_qr.h
index aafb067..8bcccf0 100644
--- a/internal/ceres/dense_qr.h
+++ b/internal/ceres/dense_qr.h
@@ -45,6 +45,12 @@
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
#include "glog/logging.h"
+#ifndef CERES_NO_CUDA
+#include "ceres/cuda_buffer.h"
+#include "cuda_runtime.h"
+#include "cublas_v2.h"
+#include "cusolverDn.h"
+#endif // CERES_NO_CUDA
namespace ceres {
namespace internal {
@@ -137,6 +143,65 @@
};
#endif // CERES_NO_LAPACK
+#ifndef CERES_NO_CUDA
+// Implementation of DenseQR using the 32-bit cuSolverDn interface. A
+// requirement for using this solver is that the lhs must not be rank deficient.
+// This is because cuSolverDn does not implement the singularity-checking
+// wrapper trtrs, hence this solver directly uses trsv from CUBLAS for the
+// backsubstitution.
+class CERES_NO_EXPORT CUDADenseQR : public DenseQR {
+ public:
+ static std::unique_ptr<CUDADenseQR> Create(
+ const LinearSolver::Options& options);
+ ~CUDADenseQR() override;
+ CUDADenseQR(const CUDADenseQR&) = delete;
+ CUDADenseQR& operator=(const CUDADenseQR&) = delete;
+ LinearSolverTerminationType Factorize(int num_rows,
+ int num_cols,
+ double* lhs,
+ std::string* message) override;
+ LinearSolverTerminationType Solve(const double* rhs,
+ double* solution,
+ std::string* message) override;
+
+ private:
+ CUDADenseQR();
+ // Initializes the cuSolverDN context, creates an asynchronous stream, and
+ // associates the stream with cuSolverDN. Returns true iff initialization was
+ // successful, else it returns false and a human-readable error message is
+ // returned.
+ bool Init(std::string* message);
+
+ // Handle to the cuSOLVER context.
+ cusolverDnHandle_t cusolver_handle_ = nullptr;
+ // Handle to cuBLAS context.
+ cublasHandle_t cublas_handle_ = nullptr;
+ // CUDA device stream.
+ cudaStream_t stream_ = nullptr;
+ // Number of rowns in the A matrix, to be cached between calls to *Factorize
+ // and *Solve.
+ size_t num_rows_ = 0;
+ // Number of columns in the A matrix, to be cached between calls to *Factorize
+ // and *Solve.
+ size_t num_cols_ = 0;
+ // GPU memory allocated for the A matrix (lhs matrix).
+ CudaBuffer<double> lhs_;
+ // GPU memory allocated for the B matrix (rhs vector).
+ CudaBuffer<double> rhs_;
+ // GPU memory allocated for the TAU matrix (scaling of householder vectors).
+ CudaBuffer<double> tau_;
+ // Scratch space for cuSOLVER on the GPU.
+ CudaBuffer<uint8_t> device_workspace_;
+ // Required for error handling with cuSOLVER.
+ CudaBuffer<int> error_;
+ // Cache the result of Factorize to ensure that when Solve is called, the
+ // factiorization of lhs is valid.
+ LinearSolverTerminationType factorize_result_ =
+ LINEAR_SOLVER_FATAL_ERROR;
+};
+
+#endif // CERES_NO_CUDA
+
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/dense_qr_test.cc b/internal/ceres/dense_qr_test.cc
index ac1258d..d9d307e 100644
--- a/internal/ceres/dense_qr_test.cc
+++ b/internal/ceres/dense_qr_test.cc
@@ -33,6 +33,7 @@
#include <memory>
#include <numeric>
#include <string>
+#include <tuple>
#include <vector>
#include "Eigen/Dense"
@@ -67,6 +68,7 @@
LinearSolver::Options options;
options.dense_linear_algebra_library_type = GetParam();
+ const double kEpsilon = std::numeric_limits<double>::epsilon() * 1.5e4;
std::unique_ptr<DenseQR> dense_qr = DenseQR::Create(options);
const int kNumTrials = 10;
@@ -83,7 +85,6 @@
Vector x = VectorType::Random(num_cols);
Vector rhs = lhs * x;
Vector actual = Vector::Random(num_cols);
-
LinearSolver::Summary summary;
summary.termination_type = dense_qr->FactorAndSolve(num_rows,
num_cols,
@@ -91,10 +92,8 @@
rhs.data(),
actual.data(),
&summary.message);
- EXPECT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS);
- EXPECT_NEAR((x - actual).norm() / x.norm(),
- 0.0,
- std::numeric_limits<double>::epsilon() * 400)
+ ASSERT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS);
+ ASSERT_NEAR((x - actual).norm() / x.norm(), 0.0, kEpsilon)
<< "\nexpected: " << x.transpose()
<< "\nactual : " << actual.transpose();
}
@@ -102,17 +101,26 @@
}
}
+namespace {
+
+// NOTE: preprocessor directives in a macro are not standard conforming
+decltype(auto) MakeValues() {
+ return ::testing::Values(
+ EIGEN
#ifndef CERES_NO_LAPACK
-INSTANTIATE_TEST_SUITE_P(_,
- DenseQRTest,
- ::testing::Values(EIGEN, LAPACK),
- ParamInfoToString);
-#else
-INSTANTIATE_TEST_SUITE_P(_,
- DenseQRTest,
- ::testing::Values(EIGEN),
- ParamInfoToString);
+ ,
+ LAPACK
#endif
+#ifndef CERES_NO_CUDA
+ ,
+ CUDA
+#endif
+ );
+}
+
+} // namespace
+
+INSTANTIATE_TEST_SUITE_P(_, DenseQRTest, MakeValues(), ParamInfoToString);
} // namespace internal
} // namespace ceres