CUDA CGNR, Part 4: CudaCgnrSolver
* Added CudaCgnrSolver, a new CUDA-accelerated CGNR.
* To use CudaCgnrSolver, the user must select CGNR as the linear_solver
and CUDA_SPARSE as the sparse_linear_algebra_library.
* Updated ConjugateGradientSolver to work with an array of pointers to
scratch to support CudaVectors as scratch.
* Moved CUDA initialization to run in Solver::Solve as needed.
Some performance comparisons on an Ubuntu 20.04 desktop with an
Intel i9-9940X CPU @ 3.30GHz, and an nVidia Quadro RTX 6000,
all configurations run with 24 threads, and 10 iterations.
=================================================
CGNR + CUDA_SPARSE + IDENTITY Preconditioner
problem-1778-993923-pre.txt
=================================================
Cost:
Initial 2.563973e+08
Final 1.724755e+06
Change 2.546725e+08
Minimizer iterations 11
Successful steps 7
Unsuccessful steps 4
Time (in seconds):
Preprocessor 4.020158
Residual only evaluation 1.567092 (10)
Jacobian & residual evaluation 7.847130 (7)
Linear solver 31.688898 (10)
Minimizer 46.834987
Postprocessor 0.353974
Total 51.209120
=================================================
SPARSE_SCHUR (CPU) + SUITE_SPARSE + AMD
problem-1778-993923-pre.txt
=================================================
Cost:
Initial 2.563973e+08
Final 1.651617e+06
Change 2.547457e+08
Minimizer iterations 11
Successful steps 11
Unsuccessful steps 0
Time (in seconds):
Preprocessor 35.812003
Residual only evaluation 1.658980 (10)
Jacobian & residual evaluation 12.218799 (11)
Linear solver 76.409992 (10)
Minimizer 98.809773
Postprocessor 0.372712
Total 134.994489
=================================================
ITERATIVE_SCHUR (CPU) + JACOBI Preconditioner
problem-1778-993923-pre.txt
=================================================
Cost:
Initial 2.563973e+08
Final 1.684447e+06
Change 2.547128e+08
Minimizer iterations 11
Successful steps 8
Unsuccessful steps 3
Time (in seconds):
Preprocessor 15.331614
Residual only evaluation 1.606114 (10)
Jacobian & residual evaluation 8.502166 (8)
Linear solver 351.910080 (10)
Minimizer 368.797327
Postprocessor 0.363536
Total 384.492478
=================================================
CGNR + CUDA_SPARSE + IDENTITY Preconditioner
problem-13682-4456117-pre.txt
=================================================
Cost:
Initial 1.126372e+09
Final 2.269329e+07
Change 1.103678e+09
Minimizer iterations 11
Successful steps 7
Unsuccessful steps 4
Time (in seconds):
Preprocessor 19.140087
Residual only evaluation 8.721920 (10)
Jacobian & residual evaluation 41.955923 (7)
Linear solver 214.121861 (10)
Minimizer 296.636890
Postprocessor 1.971827
Total 317.748804
Change-Id: I3a09f31aa6903f661e91f595afd39d427583e856
diff --git a/examples/bundle_adjuster.cc b/examples/bundle_adjuster.cc
index 27ea80e..b05c3a2 100644
--- a/examples/bundle_adjuster.cc
+++ b/examples/bundle_adjuster.cc
@@ -81,8 +81,9 @@
"automatic, cameras, points, cameras,points, points,cameras");
DEFINE_string(linear_solver, "sparse_schur", "Options are: "
- "sparse_schur, dense_schur, iterative_schur, sparse_normal_cholesky, "
- "dense_qr, dense_normal_cholesky and cgnr.");
+ "sparse_schur, dense_schur, iterative_schur, "
+ "sparse_normal_cholesky, dense_qr, dense_normal_cholesky, "
+ "and cgnr.");
DEFINE_bool(explicit_schur_complement, false, "If using ITERATIVE_SCHUR "
"then explicitly compute the Schur complement.");
DEFINE_string(preconditioner, "jacobi", "Options are: "
@@ -94,11 +95,13 @@
"Use power series expansion to initialize the solution in ITERATIVE_SCHUR linear solver.");
DEFINE_string(sparse_linear_algebra_library, "suite_sparse",
- "Options are: suite_sparse, cx_sparse, accelerate_sparse and eigen_sparse.");
+ "Options are: suite_sparse, accelerate_sparse, eigen_sparse, and "
+ "cuda_sparse.");
DEFINE_string(dense_linear_algebra_library, "eigen",
"Options are: eigen, lapack, and cuda");
DEFINE_string(ordering_type, "amd", "Options are: amd, nesdis");
-DEFINE_string(linear_solver_ordering, "automatic", "Options are: automatic and user");
+DEFINE_string(linear_solver_ordering, "automatic",
+ "Options are: automatic and user");
DEFINE_bool(use_quaternions, false, "If true, uses quaternions to represent "
"rotations. If false, angle axis is used.");
@@ -351,6 +354,7 @@
SetSolverOptionsFromFlags(&bal_problem, &options);
options.gradient_tolerance = 1e-16;
options.function_tolerance = 1e-16;
+ options.parameter_tolerance = 1e-16;
Solver::Summary summary;
Solve(options, &problem, &summary);
std::cout << summary.FullReport() << "\n";
diff --git a/include/ceres/types.h b/include/ceres/types.h
index 1b07ab3..0a110e2 100644
--- a/include/ceres/types.h
+++ b/include/ceres/types.h
@@ -137,7 +137,7 @@
// well the matrix Q approximates J'J, or how well the chosen
// residual blocks approximate the non-linear least squares
// problem.
- SUBSET,
+ SUBSET
};
enum VisibilityClusteringType {
@@ -175,6 +175,9 @@
// Apple's Accelerate framework sparse linear algebra routines.
ACCELERATE_SPARSE,
+ // Nvidia's cuSPARSE library.
+ CUDA_SPARSE,
+
// No sparse linear solver should be used. This does not necessarily
// imply that Ceres was built without any sparse library, although that
// is the likely use case, merely that one should not be used.
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index 99d5375..f4f9e0f 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -35,12 +35,15 @@
#include "ceres/block_jacobi_preconditioner.h"
#include "ceres/conjugate_gradients_solver.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_vector.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_solver.h"
#include "ceres/subset_preconditioner.h"
#include "ceres/wall_time.h"
#include "glog/logging.h"
+
namespace ceres::internal {
// A linear operator which takes a matrix A and a diagonal vector D and
@@ -117,7 +120,14 @@
}
}
-CgnrSolver::~CgnrSolver() = default;
+CgnrSolver::~CgnrSolver() {
+ for (int i = 0; i < 4; ++i) {
+ if (scratch_[i]) {
+ delete scratch_[i];
+ scratch_[i] = nullptr;
+ }
+ }
+}
LinearSolver::Summary CgnrSolver::SolveImpl(
BlockSparseMatrix* A,
@@ -163,10 +173,13 @@
cg_solution_ = Vector::Zero(A->num_cols());
for (int i = 0; i < 4; ++i) {
- scratch_[i] = Vector::Zero(A->num_cols());
+ if (scratch_[i] == nullptr) {
+ scratch_[i] = new Vector(A->num_cols());
+ }
}
event_logger.AddEvent("Setup");
+
LinearOperatorAdapter preconditioner(*preconditioner_);
auto summary = ConjugateGradientsSolver(
cg_options, lhs, rhs, preconditioner, scratch_, cg_solution_);
@@ -175,4 +188,135 @@
return summary;
}
+#ifndef CERES_NO_CUDA
+
+// A linear operator which takes a matrix A and a diagonal vector D and
+// performs products of the form
+//
+// (A^T A + D^T D)x
+//
+// This is used to implement iterative general sparse linear solving with
+// conjugate gradients, where A is the Jacobian and D is a regularizing
+// parameter. A brief proof is included in cgnr_linear_operator.h.
+class CERES_NO_EXPORT CudaCgnrLinearOperator final : public
+ ConjugateGradientsLinearOperator<CudaVector> {
+ public:
+ CudaCgnrLinearOperator(CudaSparseMatrix& A,
+ const CudaVector& D,
+ CudaVector* z) : A_(A), D_(D), z_(z) {}
+
+ void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
+ // z = Ax
+ z_->SetZero();
+ A_.RightMultiplyAndAccumulate(x, z_);
+
+ // y = y + Atz
+ // = y + AtAx
+ A_.LeftMultiplyAndAccumulate(*z_, &y);
+
+ // y = y + DtDx
+ y.DtDxpy(D_, x);
+ }
+
+ private:
+ CudaSparseMatrix& A_;
+ const CudaVector& D_;
+ CudaVector* z_ = nullptr;
+};
+
+class CERES_NO_EXPORT CudaIdentityPreconditioner final : public
+ ConjugateGradientsLinearOperator<CudaVector> {
+ public:
+ void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
+ y.Axpby(1.0, x, 1.0);
+ }
+};
+
+CudaCgnrSolver::CudaCgnrSolver(LinearSolver::Options options) :
+ options_(std::move(options)) {}
+
+CudaCgnrSolver::~CudaCgnrSolver() {
+ for (int i = 0; i < 4; ++i) {
+ if (scratch_[i]) {
+ delete scratch_[i];
+ scratch_[i] = nullptr;
+ }
+ }
+}
+
+std::unique_ptr<CudaCgnrSolver> CudaCgnrSolver::Create(
+ LinearSolver::Options options, std::string* error) {
+ CHECK(error != nullptr);
+ if (options.preconditioner_type != IDENTITY) {
+ *error = "CudaCgnrSolver does not support preconditioner type " +
+ std::string(PreconditionerTypeToString(options.preconditioner_type)) + ". ";
+ return nullptr;
+ }
+ CHECK(options.context->IsCUDAInitialized())
+ << "CudaCgnrSolver requires CUDA initialization.";
+ auto solver = std::make_unique<CudaCgnrSolver>(options);
+ return solver;
+}
+
+void CudaCgnrSolver::CpuToGpuTransfer(
+ const CompressedRowSparseMatrix& A, const double* b, const double* D) {
+ if (A_ == nullptr) {
+ // Assume structure is not cached, do an initialization and structural copy.
+ A_ = std::make_unique<CudaSparseMatrix>(options_.context, A);
+ b_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
+ x_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
+ Atb_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
+ Ax_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
+ D_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
+ for (int i = 0; i < 4; ++i) {
+ scratch_[i] = new CudaVector(options_.context, A.num_cols());
+ }
+ } else {
+ // Assume structure is cached, do a value copy.
+ A_->CopyValuesFromCpu(A);
+ }
+ b_->CopyFromCpu(ConstVectorRef(b, A.num_rows()));
+ D_->CopyFromCpu(ConstVectorRef(D, A.num_cols()));
+}
+
+LinearSolver::Summary CudaCgnrSolver::SolveImpl(
+ CompressedRowSparseMatrix* A,
+ const double* b,
+ const LinearSolver::PerSolveOptions& per_solve_options,
+ double* x) {
+ EventLogger event_logger("CudaCgnrSolver::Solve");
+ LinearSolver::Summary summary;
+ summary.num_iterations = 0;
+ summary.termination_type = LinearSolverTerminationType::FATAL_ERROR;
+
+ CpuToGpuTransfer(*A, b, per_solve_options.D);
+ event_logger.AddEvent("CPU to GPU Transfer");
+
+ // Form z = Atb.
+ Atb_->SetZero();
+ A_->LeftMultiplyAndAccumulate(*b_, Atb_.get());
+
+ // Solve (AtA + DtD)x = z (= Atb).
+ x_->SetZero();
+ CudaCgnrLinearOperator lhs(*A_, *D_, Ax_.get());
+
+ event_logger.AddEvent("Setup");
+
+ ConjugateGradientsSolverOptions cg_options;
+ cg_options.min_num_iterations = options_.min_num_iterations;
+ cg_options.max_num_iterations = options_.max_num_iterations;
+ cg_options.residual_reset_period = options_.residual_reset_period;
+ cg_options.q_tolerance = per_solve_options.q_tolerance;
+ cg_options.r_tolerance = per_solve_options.r_tolerance;
+
+ CudaIdentityPreconditioner preconditioner;
+ summary = ConjugateGradientsSolver(
+ cg_options, lhs, *Atb_, preconditioner, scratch_, *x_);
+ x_->CopyTo(x);
+ event_logger.AddEvent("Solve");
+ return summary;
+}
+
+#endif // CERES_NO_CUDA
+
} // namespace ceres::internal
diff --git a/internal/ceres/cgnr_solver.h b/internal/ceres/cgnr_solver.h
index 6982296..a971c81 100644
--- a/internal/ceres/cgnr_solver.h
+++ b/internal/ceres/cgnr_solver.h
@@ -33,6 +33,8 @@
#include <memory>
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_vector.h"
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
@@ -65,9 +67,40 @@
const LinearSolver::Options options_;
std::unique_ptr<Preconditioner> preconditioner_;
Vector cg_solution_;
- Vector scratch_[4];
+ Vector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr};
};
+#ifndef CERES_NO_CUDA
+// A Cuda-accelerated version of CgnrSolver.
+// This solver assumes that the sparsity structure of A remains constant for its
+// lifetime.
+class CERES_NO_EXPORT CudaCgnrSolver final : public CompressedRowSparseMatrixSolver {
+ public:
+ explicit CudaCgnrSolver(LinearSolver::Options options);
+ static std::unique_ptr<CudaCgnrSolver> Create(
+ LinearSolver::Options options, std::string* error);
+ ~CudaCgnrSolver() override;
+
+ Summary SolveImpl(CompressedRowSparseMatrix* A,
+ const double* b,
+ const LinearSolver::PerSolveOptions& per_solve_options,
+ double* x) final;
+
+ private:
+ void CpuToGpuTransfer(
+ const CompressedRowSparseMatrix& A, const double* b, const double* D);
+
+ LinearSolver::Options options_;
+ std::unique_ptr<CudaSparseMatrix> A_;
+ std::unique_ptr<CudaVector> b_;
+ std::unique_ptr<CudaVector> x_;
+ std::unique_ptr<CudaVector> Atb_;
+ std::unique_ptr<CudaVector> Ax_;
+ std::unique_ptr<CudaVector> D_;
+ CudaVector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr};
+};
+#endif // CERES_NO_CUDA
+
} // namespace ceres::internal
#endif // CERES_INTERNAL_CGNR_SOLVER_H_
diff --git a/internal/ceres/conjugate_gradients_solver.h b/internal/ceres/conjugate_gradients_solver.h
index f2eea87..0c3581b 100644
--- a/internal/ceres/conjugate_gradients_solver.h
+++ b/internal/ceres/conjugate_gradients_solver.h
@@ -100,25 +100,25 @@
// allows us to have a single implementation that works on CPU and GPU based
// matrices and vectors.
//
-// scratch must contain four DenseVector objects of the same size as rhs and
-// solution. By asking the user for scratch space, we guarantee that we will not
-// perform any allocations inside this function.
+// scratch must contain pointers to four DenseVector objects of the same size as
+// rhs and solution. By asking the user for scratch space, we guarantee that we
+// will not perform any allocations inside this function.
template <typename DenseVectorType>
LinearSolver::Summary ConjugateGradientsSolver(
const ConjugateGradientsSolverOptions options,
ConjugateGradientsLinearOperator<DenseVectorType>& lhs,
const DenseVectorType& rhs,
ConjugateGradientsLinearOperator<DenseVectorType>& preconditioner,
- DenseVectorType scratch[4],
+ DenseVectorType* scratch[4],
DenseVectorType& solution) {
auto IsZeroOrInfinity = [](double x) {
return ((x == 0.0) || std::isinf(x));
};
- DenseVectorType& p = scratch[0];
- DenseVectorType& r = scratch[1];
- DenseVectorType& z = scratch[2];
- DenseVectorType& tmp = scratch[3];
+ DenseVectorType& p = *scratch[0];
+ DenseVectorType& r = *scratch[1];
+ DenseVectorType& z = *scratch[2];
+ DenseVectorType& tmp = *scratch[3];
LinearSolver::Summary summary;
summary.termination_type = LinearSolverTerminationType::NO_CONVERGENCE;
@@ -160,7 +160,7 @@
SetZero(z);
preconditioner.RightMultiplyAndAccumulate(r, z);
- double last_rho = rho;
+ const double last_rho = rho;
// rho = r.dot(z);
rho = Dot(r, z);
if (IsZeroOrInfinity(rho)) {
@@ -172,7 +172,7 @@
if (summary.num_iterations == 1) {
Copy(z, p);
} else {
- double beta = rho / last_rho;
+ const double beta = rho / last_rho;
if (IsZeroOrInfinity(beta)) {
summary.termination_type = LinearSolverTerminationType::FAILURE;
summary.message = StringPrintf(
diff --git a/internal/ceres/conjugate_gradients_solver_test.cc b/internal/ceres/conjugate_gradients_solver_test.cc
index a01dfc9..e59eba7 100644
--- a/internal/ceres/conjugate_gradients_solver_test.cc
+++ b/internal/ceres/conjugate_gradients_solver_test.cc
@@ -74,9 +74,10 @@
IdentityPreconditioner identity(A->num_cols());
LinearOperatorAdapter lhs(*A);
LinearOperatorAdapter preconditioner(identity);
-
- auto summary =
- ConjugateGradientsSolver(cg_options, lhs, b, preconditioner, scratch, x);
+ Vector* scratch_array[4] =
+ {&scratch[0], &scratch[1], &scratch[2], &scratch[3]};
+ auto summary = ConjugateGradientsSolver(
+ cg_options, lhs, b, preconditioner, scratch_array, x);
EXPECT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
ASSERT_EQ(summary.num_iterations, 1);
@@ -135,13 +136,14 @@
for (int i = 0; i < 4; ++i) {
scratch[i] = Vector::Zero(A->num_cols());
}
-
+ Vector* scratch_array[4] =
+ {&scratch[0], &scratch[1], &scratch[2], &scratch[3]};
IdentityPreconditioner identity(A->num_cols());
LinearOperatorAdapter lhs(*A);
LinearOperatorAdapter preconditioner(identity);
- auto summary =
- ConjugateGradientsSolver(cg_options, lhs, b, preconditioner, scratch, x);
+ auto summary = ConjugateGradientsSolver(
+ cg_options, lhs, b, preconditioner, scratch_array, x);
EXPECT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
index c7b11ce..1483923 100644
--- a/internal/ceres/cuda_dense_cholesky_test.cc
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -36,8 +36,7 @@
#include "glog/logging.h"
#include "gtest/gtest.h"
-namespace ceres {
-namespace internal {
+namespace ceres::internal {
#ifndef CERES_NO_CUDA
@@ -45,6 +44,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
EXPECT_EQ(dense_cuda_solver, nullptr);
}
@@ -63,6 +64,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -90,6 +93,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -110,6 +115,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -123,6 +130,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -140,6 +149,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
std::unique_ptr<DenseCholesky> dense_cholesky =
CUDADenseCholesky::Create(options);
@@ -175,12 +186,20 @@
{
// Did not ask for CUDA, and did not ask for mixed precision.
LinearSolver::Options options;
+ ContextImpl context;
+ options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
ASSERT_EQ(solver, nullptr);
}
{
// Asked for CUDA, but did not ask for mixed precision.
LinearSolver::Options options;
+ ContextImpl context;
+ options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
ASSERT_EQ(solver, nullptr);
@@ -204,6 +223,8 @@
options.max_num_refinement_iterations = 0;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
options.use_mixed_precision_solves = true;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
@@ -239,6 +260,8 @@
options.max_num_refinement_iterations = 3;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
options.use_mixed_precision_solves = true;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
@@ -267,6 +290,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
options.use_mixed_precision_solves = true;
options.max_num_refinement_iterations = 20;
@@ -302,5 +327,4 @@
#endif // CERES_NO_CUDA
-} // namespace internal
-} // namespace ceres
+} // namespace ceres::internal
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc
index 9eb5d4d..cc0dbd6 100644
--- a/internal/ceres/cuda_dense_qr_test.cc
+++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -41,6 +41,10 @@
TEST(CUDADenseQR, InvalidOptionOnCreate) {
LinearSolver::Options options;
+ ContextImpl context;
+ options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
auto dense_cuda_solver = CUDADenseQR::Create(options);
EXPECT_EQ(dense_cuda_solver, nullptr);
}
@@ -58,6 +62,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseQR::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -90,6 +96,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseQR::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -112,6 +120,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseQR::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -130,10 +140,12 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ EXPECT_TRUE(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options);
- const int kNumTrials = 100;
+ const int kNumTrials = 20;
for (int i = 0; i < kNumTrials; ++i) {
LhsType lhs = LhsType::Random(kNumRows, kNumCols);
SolutionType x_expected = SolutionType::Random(kNumCols);
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index 244de74..af42e73 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -348,9 +348,8 @@
#ifndef CERES_NO_CUDA
bool CUDADenseCholesky::Init(ContextImpl* context, std::string* message) {
- if (!context->InitCUDA(message)) {
- return false;
- }
+ CHECK(context->IsCUDAInitialized())
+ << "CUDADenseCholesky requires CUDA initialization.";
cusolver_handle_ = context->cusolver_handle_;
stream_ = context->stream_;
error_.Reserve(1);
@@ -491,9 +490,8 @@
bool CUDADenseCholeskyMixedPrecision::Init(const LinearSolver::Options& options,
std::string* message) {
- if (!options.context->InitCUDA(message)) {
- return false;
- }
+ CHECK(options.context->IsCUDAInitialized())
+ << "CUDADenseCholeskyMixedPrecision requires CUDA initialization.";
cusolver_handle_ = options.context->cusolver_handle_;
cublas_handle_ = options.context->cublas_handle_;
stream_ = options.context->stream_;
diff --git a/internal/ceres/dense_cholesky_test.cc b/internal/ceres/dense_cholesky_test.cc
index 1c0df56..34ff4f3 100644
--- a/internal/ceres/dense_cholesky_test.cc
+++ b/internal/ceres/dense_cholesky_test.cc
@@ -76,6 +76,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ CHECK(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = ::testing::get<0>(GetParam());
options.use_mixed_precision_solves = ::testing::get<1>(GetParam());
const int kNumRefinementSteps = 4;
diff --git a/internal/ceres/dense_qr_test.cc b/internal/ceres/dense_qr_test.cc
index acef750..f562d97 100644
--- a/internal/ceres/dense_qr_test.cc
+++ b/internal/ceres/dense_qr_test.cc
@@ -68,6 +68,8 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
+ std::string error;
+ CHECK(context.InitCUDA(&error)) << error;
options.dense_linear_algebra_library_type = GetParam();
const double kEpsilon = std::numeric_limits<double>::epsilon() * 1.5e4;
std::unique_ptr<DenseQR> dense_qr = DenseQR::Create(options);
diff --git a/internal/ceres/evaluator.cc b/internal/ceres/evaluator.cc
index f44796a..91cf221 100644
--- a/internal/ceres/evaluator.cc
+++ b/internal/ceres/evaluator.cc
@@ -64,10 +64,17 @@
case DENSE_SCHUR:
case SPARSE_SCHUR:
case ITERATIVE_SCHUR:
- case CGNR:
- return std::make_unique<
- ProgramEvaluator<BlockEvaluatePreparer, BlockJacobianWriter>>(
- options, program);
+ case CGNR: {
+ if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
+ return std::make_unique<
+ ProgramEvaluator<ScratchEvaluatePreparer,
+ CompressedRowJacobianWriter>>(options, program);
+ } else {
+ return std::make_unique<
+ ProgramEvaluator<BlockEvaluatePreparer, BlockJacobianWriter>>(
+ options, program);
+ }
+ }
case SPARSE_NORMAL_CHOLESKY:
if (options.dynamic_sparsity) {
return std::make_unique<
diff --git a/internal/ceres/evaluator.h b/internal/ceres/evaluator.h
index 68a4fb2..7ea523c 100644
--- a/internal/ceres/evaluator.h
+++ b/internal/ceres/evaluator.h
@@ -65,6 +65,7 @@
int num_threads = 1;
int num_eliminate_blocks = -1;
LinearSolverType linear_solver_type = DENSE_QR;
+ SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type = NO_SPARSE;
bool dynamic_sparsity = false;
ContextImpl* context = nullptr;
EvaluationCallback* evaluation_callback = nullptr;
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc
index 607502e..0463afb 100644
--- a/internal/ceres/iterative_schur_complement_solver.cc
+++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -129,6 +129,9 @@
for (int i = 0; i < 4; ++i) {
scratch[i] = Vector::Zero(schur_complement_->num_cols());
}
+ Vector* scratch_ptr[4] = {
+ &scratch[0], &scratch[1], &scratch[2], &scratch[3]
+ };
event_logger.AddEvent("Setup");
@@ -137,7 +140,7 @@
lhs,
schur_complement_->rhs(),
preconditioner,
- scratch,
+ scratch_ptr,
reduced_linear_system_solution_);
if (summary.termination_type != LinearSolverTerminationType::FAILURE &&
diff --git a/internal/ceres/linear_solver.cc b/internal/ceres/linear_solver.cc
index d9ef150..2e3fe4b 100644
--- a/internal/ceres/linear_solver.cc
+++ b/internal/ceres/linear_solver.cc
@@ -76,8 +76,15 @@
CHECK(options.context != nullptr);
switch (options.type) {
- case CGNR:
+ case CGNR: {
+#ifndef CERES_NO_CUDA
+ if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
+ std::string error;
+ return CudaCgnrSolver::Create(options, &error);
+ }
+#endif
return std::make_unique<CgnrSolver>(options);
+ } break;
case SPARSE_NORMAL_CHOLESKY:
#if defined(CERES_NO_SPARSE)
diff --git a/internal/ceres/schur_complement_solver.cc b/internal/ceres/schur_complement_solver.cc
index ba96e1c..631862c 100644
--- a/internal/ceres/schur_complement_solver.cc
+++ b/internal/ceres/schur_complement_solver.cc
@@ -217,7 +217,14 @@
}
}
-SparseSchurComplementSolver::~SparseSchurComplementSolver() = default;
+SparseSchurComplementSolver::~SparseSchurComplementSolver() {
+ for (int i = 0; i < 4; ++i) {
+ if (scratch_[i]) {
+ delete scratch_[i];
+ scratch_[i] = nullptr;
+ }
+ }
+}
// Determine the non-zero blocks in the Schur Complement matrix, and
// initialize a BlockRandomAccessSparseMatrix object.
@@ -394,11 +401,11 @@
cg_options.r_tolerance = per_solve_options.r_tolerance;
cg_solution_ = Vector::Zero(sc->num_rows());
- Vector scratch[4];
for (int i = 0; i < 4; ++i) {
- scratch_[i] = Vector::Zero(sc->num_rows());
+ if (scratch_[i] == nullptr) {
+ scratch_[i] = new Vector(sc->num_rows());
+ }
}
-
auto summary = ConjugateGradientsSolver<Vector>(
cg_options, *lhs, rhs(), *preconditioner, scratch_, cg_solution_);
VectorRef(solution, sc->num_rows()) = cg_solution_;
diff --git a/internal/ceres/schur_complement_solver.h b/internal/ceres/schur_complement_solver.h
index 4fde69c..cac96b5 100644
--- a/internal/ceres/schur_complement_solver.h
+++ b/internal/ceres/schur_complement_solver.h
@@ -188,7 +188,7 @@
std::unique_ptr<SparseCholesky> sparse_cholesky_;
std::unique_ptr<BlockRandomAccessDiagonalMatrix> preconditioner_;
Vector cg_solution_;
- Vector scratch_[4];
+ Vector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr};
};
} // namespace ceres::internal
diff --git a/internal/ceres/solver.cc b/internal/ceres/solver.cc
index b772f3f..0e0790e 100644
--- a/internal/ceres/solver.cc
+++ b/internal/ceres/solver.cc
@@ -327,7 +327,8 @@
return false;
}
- if (options.preconditioner_type == SUBSET) {
+ if (options.sparse_linear_algebra_library_type != CUDA_SPARSE &&
+ options.preconditioner_type == SUBSET) {
if (options.residual_blocks_for_subset_preconditioner.empty()) {
*error =
"When using SUBSET preconditioner, "
@@ -341,6 +342,20 @@
}
}
+ // Check options for CGNR with CUDA_SPARSE.
+ if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
+ if (!IsSparseLinearAlgebraLibraryTypeAvailable(CUDA_SPARSE)) {
+ *error = "Can't use CGNR with sparse_linear_algebra_library_type = "
+ "CUDA_SPARSE because support was not enabled when Ceres was built.";
+ return false;
+ }
+ if (options.preconditioner_type != IDENTITY) {
+ StringPrintf("Can't use CGNR with preconditioner_type = %s when "
+ "sparse_linear_algebra_library_type = CUDA_SPARSE.",
+ PreconditionerTypeToString(options.preconditioner_type));
+ return false;
+ }
+ }
return true;
}
@@ -652,6 +667,18 @@
return internal::StringPrintf("%s,%s,%s", row.c_str(), e.c_str(), f.c_str());
}
+bool IsCudaRequired(const Solver::Options& options) {
+ if (options.linear_solver_type == DENSE_NORMAL_CHOLESKY ||
+ options.linear_solver_type == DENSE_SCHUR ||
+ options.linear_solver_type == DENSE_QR) {
+ return (options.dense_linear_algebra_library_type == CUDA);
+ }
+ if (options.linear_solver_type == CGNR) {
+ return (options.sparse_linear_algebra_library_type == CUDA_SPARSE);
+ }
+ return false;
+}
+
} // namespace
bool Solver::Options::IsValid(string* error) const {
@@ -697,6 +724,13 @@
Program* program = problem_impl->mutable_program();
PreSolveSummarize(options, problem_impl, summary);
+ if (IsCudaRequired(options)) {
+ if (!problem_impl->context()->InitCUDA(&summary->message)) {
+ LOG(ERROR) << "Terminating: " << summary->message;
+ return;
+ }
+ }
+
// If gradient_checking is enabled, wrap all cost functions in a
// gradient checker and install a callback that terminates if any gradient
// error is detected.
@@ -866,22 +900,40 @@
}
}
- if (linear_solver_type_used == SPARSE_NORMAL_CHOLESKY ||
+ const bool used_sparse_linear_algebra_library =
+ linear_solver_type_used == SPARSE_NORMAL_CHOLESKY ||
linear_solver_type_used == SPARSE_SCHUR ||
- (linear_solver_type_used == CGNR &&
- preconditioner_type_used == SUBSET) ||
+ linear_solver_type_used == CGNR ||
(linear_solver_type_used == ITERATIVE_SCHUR &&
- (preconditioner_type_used == CLUSTER_JACOBI ||
- preconditioner_type_used == CLUSTER_TRIDIAGONAL))) {
+ (preconditioner_type_used == CLUSTER_JACOBI ||
+ preconditioner_type_used == CLUSTER_TRIDIAGONAL));
+
+ const bool linear_solver_ordering_required =
+ linear_solver_type_used == SPARSE_SCHUR ||
+ (linear_solver_type_used == ITERATIVE_SCHUR &&
+ (preconditioner_type_used == CLUSTER_JACOBI ||
+ preconditioner_type_used == CLUSTER_TRIDIAGONAL)) ||
+ (linear_solver_type_used == CGNR && preconditioner_type_used == SUBSET);
+
+ if (used_sparse_linear_algebra_library) {
const char* mixed_precision_suffix =
(mixed_precision_solves_used ? "(Mixed Precision)" : "");
- StringAppendF(
- &report,
- "\nSparse linear algebra library %15s + %s %s\n",
- SparseLinearAlgebraLibraryTypeToString(
- sparse_linear_algebra_library_type),
- LinearSolverOrderingTypeToString(linear_solver_ordering_type),
- mixed_precision_suffix);
+ if (linear_solver_ordering_required) {
+ StringAppendF(
+ &report,
+ "\nSparse linear algebra library %15s + %s %s\n",
+ SparseLinearAlgebraLibraryTypeToString(
+ sparse_linear_algebra_library_type),
+ LinearSolverOrderingTypeToString(linear_solver_ordering_type),
+ mixed_precision_suffix);
+ } else {
+ StringAppendF(
+ &report,
+ "\nSparse linear algebra library %15s %s\n",
+ SparseLinearAlgebraLibraryTypeToString(
+ sparse_linear_algebra_library_type),
+ mixed_precision_suffix);
+ }
}
StringAppendF(&report, "\n");
diff --git a/internal/ceres/solver_test.cc b/internal/ceres/solver_test.cc
index b4b2d34..92c1ddc 100644
--- a/internal/ceres/solver_test.cc
+++ b/internal/ceres/solver_test.cc
@@ -878,6 +878,20 @@
options.dynamic_sparsity = false;
options.use_mixed_precision_solves = true;
EXPECT_FALSE(options.IsValid(&message));
+
+ options.sparse_linear_algebra_library_type = CUDA_SPARSE;
+ options.dynamic_sparsity = false;
+ options.use_mixed_precision_solves = false;
+ EXPECT_EQ(options.IsValid(&message),
+ IsSparseLinearAlgebraLibraryTypeAvailable(CUDA_SPARSE));
+
+ options.dynamic_sparsity = true;
+ options.use_mixed_precision_solves = false;
+ EXPECT_FALSE(options.IsValid(&message));
+
+ options.dynamic_sparsity = false;
+ options.use_mixed_precision_solves = true;
+ EXPECT_FALSE(options.IsValid(&message));
}
TEST(Solver, CgnrOptionsJacobiPreconditioner) {
@@ -940,6 +954,20 @@
options.dynamic_sparsity = false;
options.use_mixed_precision_solves = true;
EXPECT_FALSE(options.IsValid(&message));
+
+ options.sparse_linear_algebra_library_type = CUDA_SPARSE;
+
+ options.dynamic_sparsity = false;
+ options.use_mixed_precision_solves = false;
+ EXPECT_FALSE(options.IsValid(&message));
+
+ options.dynamic_sparsity = true;
+ options.use_mixed_precision_solves = false;
+ EXPECT_FALSE(options.IsValid(&message));
+
+ options.dynamic_sparsity = false;
+ options.use_mixed_precision_solves = true;
+ EXPECT_FALSE(options.IsValid(&message));
}
TEST(Solver, CgnrOptionsSubsetPreconditioner) {
@@ -1012,6 +1040,19 @@
options.use_mixed_precision_solves = true;
EXPECT_FALSE(options.IsValid(&message));
}
+
+ options.sparse_linear_algebra_library_type = CUDA_SPARSE;
+ options.dynamic_sparsity = false;
+ options.use_mixed_precision_solves = false;
+ EXPECT_FALSE(options.IsValid(&message));
+
+ options.dynamic_sparsity = true;
+ options.use_mixed_precision_solves = false;
+ EXPECT_FALSE(options.IsValid(&message));
+
+ options.dynamic_sparsity = false;
+ options.use_mixed_precision_solves = true;
+ EXPECT_FALSE(options.IsValid(&message));
}
TEST(Solver, CgnrOptionsSchurPreconditioners) {
diff --git a/internal/ceres/trust_region_preprocessor.cc b/internal/ceres/trust_region_preprocessor.cc
index d27a472..84e783f 100644
--- a/internal/ceres/trust_region_preprocessor.cc
+++ b/internal/ceres/trust_region_preprocessor.cc
@@ -261,6 +261,8 @@
const Solver::Options& options = pp->options;
pp->evaluator_options = Evaluator::Options();
pp->evaluator_options.linear_solver_type = options.linear_solver_type;
+ pp->evaluator_options.sparse_linear_algebra_library_type =
+ options.sparse_linear_algebra_library_type;
pp->evaluator_options.num_eliminate_blocks = 0;
if (IsSchurType(options.linear_solver_type)) {
pp->evaluator_options.num_eliminate_blocks =
diff --git a/internal/ceres/types.cc b/internal/ceres/types.cc
index 1d514de..3de91ed 100644
--- a/internal/ceres/types.cc
+++ b/internal/ceres/types.cc
@@ -108,6 +108,7 @@
CASESTR(SUITE_SPARSE);
CASESTR(EIGEN_SPARSE);
CASESTR(ACCELERATE_SPARSE);
+ CASESTR(CUDA_SPARSE);
CASESTR(NO_SPARSE);
default:
return "UNKNOWN";
@@ -120,6 +121,7 @@
STRENUM(SUITE_SPARSE);
STRENUM(EIGEN_SPARSE);
STRENUM(ACCELERATE_SPARSE);
+ STRENUM(CUDA_SPARSE);
STRENUM(NO_SPARSE);
return false;
}
@@ -420,6 +422,14 @@
#endif
}
+ if (type == CUDA_SPARSE) {
+#ifdef CERES_NO_CUDA
+ return false;
+#else
+ return true;
+#endif
+ }
+
if (type == NO_SPARSE) {
return true;
}