CUDA CGNR, Part 4: CudaCgnrSolver * Added CudaCgnrSolver, a new CUDA-accelerated CGNR. * To use CudaCgnrSolver, the user must select CGNR as the linear_solver and CUDA_SPARSE as the sparse_linear_algebra_library. * Updated ConjugateGradientsSolver to work with an array of pointers to scratch to support CudaVectors as scratch. * Moved CUDA initialization to run in Solver::Solve as needed. Some performance comparisons on an Ubuntu 20.04 desktop with an Intel i9-9940X CPU @ 3.30GHz, and an Nvidia Quadro RTX 6000, all configurations run with 24 threads, and 10 iterations. ================================================= CGNR + CUDA_SPARSE + IDENTITY Preconditioner problem-1778-993923-pre.txt ================================================= Cost: Initial 2.563973e+08 Final 1.724755e+06 Change 2.546725e+08 Minimizer iterations 11 Successful steps 7 Unsuccessful steps 4 Time (in seconds): Preprocessor 4.020158 Residual only evaluation 1.567092 (10) Jacobian & residual evaluation 7.847130 (7) Linear solver 31.688898 (10) Minimizer 46.834987 Postprocessor 0.353974 Total 51.209120 ================================================= SPARSE_SCHUR (CPU) + SUITE_SPARSE + AMD problem-1778-993923-pre.txt ================================================= Cost: Initial 2.563973e+08 Final 1.651617e+06 Change 2.547457e+08 Minimizer iterations 11 Successful steps 11 Unsuccessful steps 0 Time (in seconds): Preprocessor 35.812003 Residual only evaluation 1.658980 (10) Jacobian & residual evaluation 12.218799 (11) Linear solver 76.409992 (10) Minimizer 98.809773 Postprocessor 0.372712 Total 134.994489 ================================================= ITERATIVE_SCHUR (CPU) + JACOBI Preconditioner problem-1778-993923-pre.txt ================================================= Cost: Initial 2.563973e+08 Final 1.684447e+06 Change 2.547128e+08 Minimizer iterations 11 Successful steps 8 Unsuccessful steps 3 Time (in seconds): Preprocessor 15.331614 Residual only evaluation 1.606114 (10) Jacobian & 
residual evaluation 8.502166 (8) Linear solver 351.910080 (10) Minimizer 368.797327 Postprocessor 0.363536 Total 384.492478 ================================================= CGNR + CUDA_SPARSE + IDENTITY Preconditioner problem-13682-4456117-pre.txt ================================================= Cost: Initial 1.126372e+09 Final 2.269329e+07 Change 1.103678e+09 Minimizer iterations 11 Successful steps 7 Unsuccessful steps 4 Time (in seconds): Preprocessor 19.140087 Residual only evaluation 8.721920 (10) Jacobian & residual evaluation 41.955923 (7) Linear solver 214.121861 (10) Minimizer 296.636890 Postprocessor 1.971827 Total 317.748804 Change-Id: I3a09f31aa6903f661e91f595afd39d427583e856
diff --git a/examples/bundle_adjuster.cc b/examples/bundle_adjuster.cc index 27ea80e..b05c3a2 100644 --- a/examples/bundle_adjuster.cc +++ b/examples/bundle_adjuster.cc
@@ -81,8 +81,9 @@ "automatic, cameras, points, cameras,points, points,cameras"); DEFINE_string(linear_solver, "sparse_schur", "Options are: " - "sparse_schur, dense_schur, iterative_schur, sparse_normal_cholesky, " - "dense_qr, dense_normal_cholesky and cgnr."); + "sparse_schur, dense_schur, iterative_schur, " + "sparse_normal_cholesky, dense_qr, dense_normal_cholesky, " + "and cgnr."); DEFINE_bool(explicit_schur_complement, false, "If using ITERATIVE_SCHUR " "then explicitly compute the Schur complement."); DEFINE_string(preconditioner, "jacobi", "Options are: " @@ -94,11 +95,13 @@ "Use power series expansion to initialize the solution in ITERATIVE_SCHUR linear solver."); DEFINE_string(sparse_linear_algebra_library, "suite_sparse", - "Options are: suite_sparse, cx_sparse, accelerate_sparse and eigen_sparse."); + "Options are: suite_sparse, accelerate_sparse, eigen_sparse, and " + "cuda_sparse."); DEFINE_string(dense_linear_algebra_library, "eigen", "Options are: eigen, lapack, and cuda"); DEFINE_string(ordering_type, "amd", "Options are: amd, nesdis"); -DEFINE_string(linear_solver_ordering, "automatic", "Options are: automatic and user"); +DEFINE_string(linear_solver_ordering, "automatic", + "Options are: automatic and user"); DEFINE_bool(use_quaternions, false, "If true, uses quaternions to represent " "rotations. If false, angle axis is used."); @@ -351,6 +354,7 @@ SetSolverOptionsFromFlags(&bal_problem, &options); options.gradient_tolerance = 1e-16; options.function_tolerance = 1e-16; + options.parameter_tolerance = 1e-16; Solver::Summary summary; Solve(options, &problem, &summary); std::cout << summary.FullReport() << "\n";
diff --git a/include/ceres/types.h b/include/ceres/types.h index 1b07ab3..0a110e2 100644 --- a/include/ceres/types.h +++ b/include/ceres/types.h
@@ -137,7 +137,7 @@ // well the matrix Q approximates J'J, or how well the chosen // residual blocks approximate the non-linear least squares // problem. - SUBSET, + SUBSET }; enum VisibilityClusteringType { @@ -175,6 +175,9 @@ // Apple's Accelerate framework sparse linear algebra routines. ACCELERATE_SPARSE, + // Nvidia's cuSPARSE library. + CUDA_SPARSE, + // No sparse linear solver should be used. This does not necessarily // imply that Ceres was built without any sparse library, although that // is the likely use case, merely that one should not be used.
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc index 99d5375..f4f9e0f 100644 --- a/internal/ceres/cgnr_solver.cc +++ b/internal/ceres/cgnr_solver.cc
@@ -35,12 +35,15 @@ #include "ceres/block_jacobi_preconditioner.h" #include "ceres/conjugate_gradients_solver.h" +#include "ceres/cuda_sparse_matrix.h" +#include "ceres/cuda_vector.h" #include "ceres/internal/eigen.h" #include "ceres/linear_solver.h" #include "ceres/subset_preconditioner.h" #include "ceres/wall_time.h" #include "glog/logging.h" + namespace ceres::internal { // A linear operator which takes a matrix A and a diagonal vector D and @@ -117,7 +120,14 @@ } } -CgnrSolver::~CgnrSolver() = default; +CgnrSolver::~CgnrSolver() { + for (int i = 0; i < 4; ++i) { + if (scratch_[i]) { + delete scratch_[i]; + scratch_[i] = nullptr; + } + } +} LinearSolver::Summary CgnrSolver::SolveImpl( BlockSparseMatrix* A, @@ -163,10 +173,13 @@ cg_solution_ = Vector::Zero(A->num_cols()); for (int i = 0; i < 4; ++i) { - scratch_[i] = Vector::Zero(A->num_cols()); + if (scratch_[i] == nullptr) { + scratch_[i] = new Vector(A->num_cols()); + } } event_logger.AddEvent("Setup"); + LinearOperatorAdapter preconditioner(*preconditioner_); auto summary = ConjugateGradientsSolver( cg_options, lhs, rhs, preconditioner, scratch_, cg_solution_); @@ -175,4 +188,135 @@ return summary; } +#ifndef CERES_NO_CUDA + +// A linear operator which takes a matrix A and a diagonal vector D and +// performs products of the form +// +// (A^T A + D^T D)x +// +// This is used to implement iterative general sparse linear solving with +// conjugate gradients, where A is the Jacobian and D is a regularizing +// parameter. A brief proof is included in cgnr_linear_operator.h. 
+class CERES_NO_EXPORT CudaCgnrLinearOperator final : public + ConjugateGradientsLinearOperator<CudaVector> { + public: + CudaCgnrLinearOperator(CudaSparseMatrix& A, + const CudaVector& D, + CudaVector* z) : A_(A), D_(D), z_(z) {} + + void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final { + // z = Ax + z_->SetZero(); + A_.RightMultiplyAndAccumulate(x, z_); + + // y = y + Atz + // = y + AtAx + A_.LeftMultiplyAndAccumulate(*z_, &y); + + // y = y + DtDx + y.DtDxpy(D_, x); + } + + private: + CudaSparseMatrix& A_; + const CudaVector& D_; + CudaVector* z_ = nullptr; +}; + +class CERES_NO_EXPORT CudaIdentityPreconditioner final : public + ConjugateGradientsLinearOperator<CudaVector> { + public: + void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final { + y.Axpby(1.0, x, 1.0); + } +}; + +CudaCgnrSolver::CudaCgnrSolver(LinearSolver::Options options) : + options_(std::move(options)) {} + +CudaCgnrSolver::~CudaCgnrSolver() { + for (int i = 0; i < 4; ++i) { + if (scratch_[i]) { + delete scratch_[i]; + scratch_[i] = nullptr; + } + } +} + +std::unique_ptr<CudaCgnrSolver> CudaCgnrSolver::Create( + LinearSolver::Options options, std::string* error) { + CHECK(error != nullptr); + if (options.preconditioner_type != IDENTITY) { + *error = "CudaCgnrSolver does not support preconditioner type " + + std::string(PreconditionerTypeToString(options.preconditioner_type)) + ". "; + return nullptr; + } + CHECK(options.context->IsCUDAInitialized()) + << "CudaCgnrSolver requires CUDA initialization."; + auto solver = std::make_unique<CudaCgnrSolver>(options); + return solver; +} + +void CudaCgnrSolver::CpuToGpuTransfer( + const CompressedRowSparseMatrix& A, const double* b, const double* D) { + if (A_ == nullptr) { + // Assume structure is not cached, do an initialization and structural copy. 
+ A_ = std::make_unique<CudaSparseMatrix>(options_.context, A); + b_ = std::make_unique<CudaVector>(options_.context, A.num_rows()); + x_ = std::make_unique<CudaVector>(options_.context, A.num_cols()); + Atb_ = std::make_unique<CudaVector>(options_.context, A.num_cols()); + Ax_ = std::make_unique<CudaVector>(options_.context, A.num_rows()); + D_ = std::make_unique<CudaVector>(options_.context, A.num_cols()); + for (int i = 0; i < 4; ++i) { + scratch_[i] = new CudaVector(options_.context, A.num_cols()); + } + } else { + // Assume structure is cached, do a value copy. + A_->CopyValuesFromCpu(A); + } + b_->CopyFromCpu(ConstVectorRef(b, A.num_rows())); + D_->CopyFromCpu(ConstVectorRef(D, A.num_cols())); +} + +LinearSolver::Summary CudaCgnrSolver::SolveImpl( + CompressedRowSparseMatrix* A, + const double* b, + const LinearSolver::PerSolveOptions& per_solve_options, + double* x) { + EventLogger event_logger("CudaCgnrSolver::Solve"); + LinearSolver::Summary summary; + summary.num_iterations = 0; + summary.termination_type = LinearSolverTerminationType::FATAL_ERROR; + + CpuToGpuTransfer(*A, b, per_solve_options.D); + event_logger.AddEvent("CPU to GPU Transfer"); + + // Form z = Atb. + Atb_->SetZero(); + A_->LeftMultiplyAndAccumulate(*b_, Atb_.get()); + + // Solve (AtA + DtD)x = z (= Atb). 
+ x_->SetZero(); + CudaCgnrLinearOperator lhs(*A_, *D_, Ax_.get()); + + event_logger.AddEvent("Setup"); + + ConjugateGradientsSolverOptions cg_options; + cg_options.min_num_iterations = options_.min_num_iterations; + cg_options.max_num_iterations = options_.max_num_iterations; + cg_options.residual_reset_period = options_.residual_reset_period; + cg_options.q_tolerance = per_solve_options.q_tolerance; + cg_options.r_tolerance = per_solve_options.r_tolerance; + + CudaIdentityPreconditioner preconditioner; + summary = ConjugateGradientsSolver( + cg_options, lhs, *Atb_, preconditioner, scratch_, *x_); + x_->CopyTo(x); + event_logger.AddEvent("Solve"); + return summary; +} + +#endif // CERES_NO_CUDA + } // namespace ceres::internal
diff --git a/internal/ceres/cgnr_solver.h b/internal/ceres/cgnr_solver.h index 6982296..a971c81 100644 --- a/internal/ceres/cgnr_solver.h +++ b/internal/ceres/cgnr_solver.h
@@ -33,6 +33,8 @@ #include <memory> +#include "ceres/cuda_sparse_matrix.h" +#include "ceres/cuda_vector.h" #include "ceres/internal/export.h" #include "ceres/linear_solver.h" @@ -65,9 +67,40 @@ const LinearSolver::Options options_; std::unique_ptr<Preconditioner> preconditioner_; Vector cg_solution_; - Vector scratch_[4]; + Vector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr}; }; +#ifndef CERES_NO_CUDA +// A Cuda-accelerated version of CgnrSolver. +// This solver assumes that the sparsity structure of A remains constant for its +// lifetime. +class CERES_NO_EXPORT CudaCgnrSolver final : public CompressedRowSparseMatrixSolver { + public: + explicit CudaCgnrSolver(LinearSolver::Options options); + static std::unique_ptr<CudaCgnrSolver> Create( + LinearSolver::Options options, std::string* error); + ~CudaCgnrSolver() override; + + Summary SolveImpl(CompressedRowSparseMatrix* A, + const double* b, + const LinearSolver::PerSolveOptions& per_solve_options, + double* x) final; + + private: + void CpuToGpuTransfer( + const CompressedRowSparseMatrix& A, const double* b, const double* D); + + LinearSolver::Options options_; + std::unique_ptr<CudaSparseMatrix> A_; + std::unique_ptr<CudaVector> b_; + std::unique_ptr<CudaVector> x_; + std::unique_ptr<CudaVector> Atb_; + std::unique_ptr<CudaVector> Ax_; + std::unique_ptr<CudaVector> D_; + CudaVector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr}; +}; +#endif // CERES_NO_CUDA + } // namespace ceres::internal #endif // CERES_INTERNAL_CGNR_SOLVER_H_
diff --git a/internal/ceres/conjugate_gradients_solver.h b/internal/ceres/conjugate_gradients_solver.h index f2eea87..0c3581b 100644 --- a/internal/ceres/conjugate_gradients_solver.h +++ b/internal/ceres/conjugate_gradients_solver.h
@@ -100,25 +100,25 @@ // allows us to have a single implementation that works on CPU and GPU based // matrices and vectors. // -// scratch must contain four DenseVector objects of the same size as rhs and -// solution. By asking the user for scratch space, we guarantee that we will not -// perform any allocations inside this function. +// scratch must contain pointers to four DenseVector objects of the same size as +// rhs and solution. By asking the user for scratch space, we guarantee that we +// will not perform any allocations inside this function. template <typename DenseVectorType> LinearSolver::Summary ConjugateGradientsSolver( const ConjugateGradientsSolverOptions options, ConjugateGradientsLinearOperator<DenseVectorType>& lhs, const DenseVectorType& rhs, ConjugateGradientsLinearOperator<DenseVectorType>& preconditioner, - DenseVectorType scratch[4], + DenseVectorType* scratch[4], DenseVectorType& solution) { auto IsZeroOrInfinity = [](double x) { return ((x == 0.0) || std::isinf(x)); }; - DenseVectorType& p = scratch[0]; - DenseVectorType& r = scratch[1]; - DenseVectorType& z = scratch[2]; - DenseVectorType& tmp = scratch[3]; + DenseVectorType& p = *scratch[0]; + DenseVectorType& r = *scratch[1]; + DenseVectorType& z = *scratch[2]; + DenseVectorType& tmp = *scratch[3]; LinearSolver::Summary summary; summary.termination_type = LinearSolverTerminationType::NO_CONVERGENCE; @@ -160,7 +160,7 @@ SetZero(z); preconditioner.RightMultiplyAndAccumulate(r, z); - double last_rho = rho; + const double last_rho = rho; // rho = r.dot(z); rho = Dot(r, z); if (IsZeroOrInfinity(rho)) { @@ -172,7 +172,7 @@ if (summary.num_iterations == 1) { Copy(z, p); } else { - double beta = rho / last_rho; + const double beta = rho / last_rho; if (IsZeroOrInfinity(beta)) { summary.termination_type = LinearSolverTerminationType::FAILURE; summary.message = StringPrintf(
diff --git a/internal/ceres/conjugate_gradients_solver_test.cc b/internal/ceres/conjugate_gradients_solver_test.cc index a01dfc9..e59eba7 100644 --- a/internal/ceres/conjugate_gradients_solver_test.cc +++ b/internal/ceres/conjugate_gradients_solver_test.cc
@@ -74,9 +74,10 @@ IdentityPreconditioner identity(A->num_cols()); LinearOperatorAdapter lhs(*A); LinearOperatorAdapter preconditioner(identity); - - auto summary = - ConjugateGradientsSolver(cg_options, lhs, b, preconditioner, scratch, x); + Vector* scratch_array[4] = + {&scratch[0], &scratch[1], &scratch[2], &scratch[3]}; + auto summary = ConjugateGradientsSolver( + cg_options, lhs, b, preconditioner, scratch_array, x); EXPECT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS); ASSERT_EQ(summary.num_iterations, 1); @@ -135,13 +136,14 @@ for (int i = 0; i < 4; ++i) { scratch[i] = Vector::Zero(A->num_cols()); } - + Vector* scratch_array[4] = + {&scratch[0], &scratch[1], &scratch[2], &scratch[3]}; IdentityPreconditioner identity(A->num_cols()); LinearOperatorAdapter lhs(*A); LinearOperatorAdapter preconditioner(identity); - auto summary = - ConjugateGradientsSolver(cg_options, lhs, b, preconditioner, scratch, x); + auto summary = ConjugateGradientsSolver( + cg_options, lhs, b, preconditioner, scratch_array, x); EXPECT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc index c7b11ce..1483923 100644 --- a/internal/ceres/cuda_dense_cholesky_test.cc +++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -36,8 +36,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" -namespace ceres { -namespace internal { +namespace ceres::internal { #ifndef CERES_NO_CUDA @@ -45,6 +44,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; auto dense_cuda_solver = CUDADenseCholesky::Create(options); EXPECT_EQ(dense_cuda_solver, nullptr); } @@ -63,6 +64,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -90,6 +93,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -110,6 +115,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -123,6 +130,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseCholesky::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -140,6 +149,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = ceres::CUDA; std::unique_ptr<DenseCholesky> dense_cholesky = 
CUDADenseCholesky::Create(options); @@ -175,12 +186,20 @@ { // Did not ask for CUDA, and did not ask for mixed precision. LinearSolver::Options options; + ContextImpl context; + options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; auto solver = CUDADenseCholeskyMixedPrecision::Create(options); ASSERT_EQ(solver, nullptr); } { // Asked for CUDA, but did not ask for mixed precision. LinearSolver::Options options; + ContextImpl context; + options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = ceres::CUDA; auto solver = CUDADenseCholeskyMixedPrecision::Create(options); ASSERT_EQ(solver, nullptr); @@ -204,6 +223,8 @@ options.max_num_refinement_iterations = 0; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = CUDA; options.use_mixed_precision_solves = true; auto solver = CUDADenseCholeskyMixedPrecision::Create(options); @@ -239,6 +260,8 @@ options.max_num_refinement_iterations = 3; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = CUDA; options.use_mixed_precision_solves = true; auto solver = CUDADenseCholeskyMixedPrecision::Create(options); @@ -267,6 +290,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = ceres::CUDA; options.use_mixed_precision_solves = true; options.max_num_refinement_iterations = 20; @@ -302,5 +327,4 @@ #endif // CERES_NO_CUDA -} // namespace internal -} // namespace ceres +} // namespace ceres::internal
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc index 9eb5d4d..cc0dbd6 100644 --- a/internal/ceres/cuda_dense_qr_test.cc +++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -41,6 +41,10 @@ TEST(CUDADenseQR, InvalidOptionOnCreate) { LinearSolver::Options options; + ContextImpl context; + options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; auto dense_cuda_solver = CUDADenseQR::Create(options); EXPECT_EQ(dense_cuda_solver, nullptr); } @@ -58,6 +62,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseQR::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -90,6 +96,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseQR::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -112,6 +120,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = CUDA; auto dense_cuda_solver = CUDADenseQR::Create(options); ASSERT_NE(dense_cuda_solver, nullptr); @@ -130,10 +140,12 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + EXPECT_TRUE(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = ceres::CUDA; std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options); - const int kNumTrials = 100; + const int kNumTrials = 20; for (int i = 0; i < kNumTrials; ++i) { LhsType lhs = LhsType::Random(kNumRows, kNumCols); SolutionType x_expected = SolutionType::Random(kNumCols);
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc index 244de74..af42e73 100644 --- a/internal/ceres/dense_cholesky.cc +++ b/internal/ceres/dense_cholesky.cc
@@ -348,9 +348,8 @@ #ifndef CERES_NO_CUDA bool CUDADenseCholesky::Init(ContextImpl* context, std::string* message) { - if (!context->InitCUDA(message)) { - return false; - } + CHECK(context->IsCUDAInitialized()) + << "CUDADenseCholesky requires CUDA initialization."; cusolver_handle_ = context->cusolver_handle_; stream_ = context->stream_; error_.Reserve(1); @@ -491,9 +490,8 @@ bool CUDADenseCholeskyMixedPrecision::Init(const LinearSolver::Options& options, std::string* message) { - if (!options.context->InitCUDA(message)) { - return false; - } + CHECK(options.context->IsCUDAInitialized()) + << "CUDADenseCholeskyMixedPrecision requires CUDA initialization."; cusolver_handle_ = options.context->cusolver_handle_; cublas_handle_ = options.context->cublas_handle_; stream_ = options.context->stream_;
diff --git a/internal/ceres/dense_cholesky_test.cc b/internal/ceres/dense_cholesky_test.cc index 1c0df56..34ff4f3 100644 --- a/internal/ceres/dense_cholesky_test.cc +++ b/internal/ceres/dense_cholesky_test.cc
@@ -76,6 +76,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + CHECK(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = ::testing::get<0>(GetParam()); options.use_mixed_precision_solves = ::testing::get<1>(GetParam()); const int kNumRefinementSteps = 4;
diff --git a/internal/ceres/dense_qr_test.cc b/internal/ceres/dense_qr_test.cc index acef750..f562d97 100644 --- a/internal/ceres/dense_qr_test.cc +++ b/internal/ceres/dense_qr_test.cc
@@ -68,6 +68,8 @@ LinearSolver::Options options; ContextImpl context; options.context = &context; + std::string error; + CHECK(context.InitCUDA(&error)) << error; options.dense_linear_algebra_library_type = GetParam(); const double kEpsilon = std::numeric_limits<double>::epsilon() * 1.5e4; std::unique_ptr<DenseQR> dense_qr = DenseQR::Create(options);
diff --git a/internal/ceres/evaluator.cc b/internal/ceres/evaluator.cc index f44796a..91cf221 100644 --- a/internal/ceres/evaluator.cc +++ b/internal/ceres/evaluator.cc
@@ -64,10 +64,17 @@ case DENSE_SCHUR: case SPARSE_SCHUR: case ITERATIVE_SCHUR: - case CGNR: - return std::make_unique< - ProgramEvaluator<BlockEvaluatePreparer, BlockJacobianWriter>>( - options, program); + case CGNR: { + if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) { + return std::make_unique< + ProgramEvaluator<ScratchEvaluatePreparer, + CompressedRowJacobianWriter>>(options, program); + } else { + return std::make_unique< + ProgramEvaluator<BlockEvaluatePreparer, BlockJacobianWriter>>( + options, program); + } + } case SPARSE_NORMAL_CHOLESKY: if (options.dynamic_sparsity) { return std::make_unique<
diff --git a/internal/ceres/evaluator.h b/internal/ceres/evaluator.h index 68a4fb2..7ea523c 100644 --- a/internal/ceres/evaluator.h +++ b/internal/ceres/evaluator.h
@@ -65,6 +65,7 @@ int num_threads = 1; int num_eliminate_blocks = -1; LinearSolverType linear_solver_type = DENSE_QR; + SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type = NO_SPARSE; bool dynamic_sparsity = false; ContextImpl* context = nullptr; EvaluationCallback* evaluation_callback = nullptr;
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc index 607502e..0463afb 100644 --- a/internal/ceres/iterative_schur_complement_solver.cc +++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -129,6 +129,9 @@ for (int i = 0; i < 4; ++i) { scratch[i] = Vector::Zero(schur_complement_->num_cols()); } + Vector* scratch_ptr[4] = { + &scratch[0], &scratch[1], &scratch[2], &scratch[3] + }; event_logger.AddEvent("Setup"); @@ -137,7 +140,7 @@ lhs, schur_complement_->rhs(), preconditioner, - scratch, + scratch_ptr, reduced_linear_system_solution_); if (summary.termination_type != LinearSolverTerminationType::FAILURE &&
diff --git a/internal/ceres/linear_solver.cc b/internal/ceres/linear_solver.cc index d9ef150..2e3fe4b 100644 --- a/internal/ceres/linear_solver.cc +++ b/internal/ceres/linear_solver.cc
@@ -76,8 +76,15 @@ CHECK(options.context != nullptr); switch (options.type) { - case CGNR: + case CGNR: { +#ifndef CERES_NO_CUDA + if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) { + std::string error; + return CudaCgnrSolver::Create(options, &error); + } +#endif return std::make_unique<CgnrSolver>(options); + } break; case SPARSE_NORMAL_CHOLESKY: #if defined(CERES_NO_SPARSE)
diff --git a/internal/ceres/schur_complement_solver.cc b/internal/ceres/schur_complement_solver.cc index ba96e1c..631862c 100644 --- a/internal/ceres/schur_complement_solver.cc +++ b/internal/ceres/schur_complement_solver.cc
@@ -217,7 +217,14 @@ } } -SparseSchurComplementSolver::~SparseSchurComplementSolver() = default; +SparseSchurComplementSolver::~SparseSchurComplementSolver() { + for (int i = 0; i < 4; ++i) { + if (scratch_[i]) { + delete scratch_[i]; + scratch_[i] = nullptr; + } + } +} // Determine the non-zero blocks in the Schur Complement matrix, and // initialize a BlockRandomAccessSparseMatrix object. @@ -394,11 +401,11 @@ cg_options.r_tolerance = per_solve_options.r_tolerance; cg_solution_ = Vector::Zero(sc->num_rows()); - Vector scratch[4]; for (int i = 0; i < 4; ++i) { - scratch_[i] = Vector::Zero(sc->num_rows()); + if (scratch_[i] == nullptr) { + scratch_[i] = new Vector(sc->num_rows()); + } } - auto summary = ConjugateGradientsSolver<Vector>( cg_options, *lhs, rhs(), *preconditioner, scratch_, cg_solution_); VectorRef(solution, sc->num_rows()) = cg_solution_;
diff --git a/internal/ceres/schur_complement_solver.h b/internal/ceres/schur_complement_solver.h index 4fde69c..cac96b5 100644 --- a/internal/ceres/schur_complement_solver.h +++ b/internal/ceres/schur_complement_solver.h
@@ -188,7 +188,7 @@ std::unique_ptr<SparseCholesky> sparse_cholesky_; std::unique_ptr<BlockRandomAccessDiagonalMatrix> preconditioner_; Vector cg_solution_; - Vector scratch_[4]; + Vector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr}; }; } // namespace ceres::internal
diff --git a/internal/ceres/solver.cc b/internal/ceres/solver.cc index b772f3f..0e0790e 100644 --- a/internal/ceres/solver.cc +++ b/internal/ceres/solver.cc
@@ -327,7 +327,8 @@ return false; } - if (options.preconditioner_type == SUBSET) { + if (options.sparse_linear_algebra_library_type != CUDA_SPARSE && + options.preconditioner_type == SUBSET) { if (options.residual_blocks_for_subset_preconditioner.empty()) { *error = "When using SUBSET preconditioner, " @@ -341,6 +342,20 @@ } } + // Check options for CGNR with CUDA_SPARSE. + if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) { + if (!IsSparseLinearAlgebraLibraryTypeAvailable(CUDA_SPARSE)) { + *error = "Can't use CGNR with sparse_linear_algebra_library_type = " + "CUDA_SPARSE because support was not enabled when Ceres was built."; + return false; + } + if (options.preconditioner_type != IDENTITY) { + *error = StringPrintf("Can't use CGNR with preconditioner_type = %s when " + "sparse_linear_algebra_library_type = CUDA_SPARSE.", + PreconditionerTypeToString(options.preconditioner_type)); + return false; + } + } return true; } @@ -652,6 +667,18 @@ return internal::StringPrintf("%s,%s,%s", row.c_str(), e.c_str(), f.c_str()); } +bool IsCudaRequired(const Solver::Options& options) { + if (options.linear_solver_type == DENSE_NORMAL_CHOLESKY || + options.linear_solver_type == DENSE_SCHUR || + options.linear_solver_type == DENSE_QR) { + return (options.dense_linear_algebra_library_type == CUDA); + } + if (options.linear_solver_type == CGNR) { + return (options.sparse_linear_algebra_library_type == CUDA_SPARSE); + } + return false; +} + } // namespace bool Solver::Options::IsValid(string* error) const { @@ -697,6 +724,13 @@ Program* program = problem_impl->mutable_program(); PreSolveSummarize(options, problem_impl, summary); + if (IsCudaRequired(options)) { + if (!problem_impl->context()->InitCUDA(&summary->message)) { + LOG(ERROR) << "Terminating: " << summary->message; + return; + } + } + // If gradient_checking is enabled, wrap all cost functions in a // gradient checker and install a callback that terminates if any gradient // error is detected. 
@@ -866,22 +900,40 @@ } } - if (linear_solver_type_used == SPARSE_NORMAL_CHOLESKY || + const bool used_sparse_linear_algebra_library = + linear_solver_type_used == SPARSE_NORMAL_CHOLESKY || linear_solver_type_used == SPARSE_SCHUR || - (linear_solver_type_used == CGNR && - preconditioner_type_used == SUBSET) || + linear_solver_type_used == CGNR || (linear_solver_type_used == ITERATIVE_SCHUR && - (preconditioner_type_used == CLUSTER_JACOBI || - preconditioner_type_used == CLUSTER_TRIDIAGONAL))) { + (preconditioner_type_used == CLUSTER_JACOBI || + preconditioner_type_used == CLUSTER_TRIDIAGONAL)); + + const bool linear_solver_ordering_required = + linear_solver_type_used == SPARSE_SCHUR || + (linear_solver_type_used == ITERATIVE_SCHUR && + (preconditioner_type_used == CLUSTER_JACOBI || + preconditioner_type_used == CLUSTER_TRIDIAGONAL)) || + (linear_solver_type_used == CGNR && preconditioner_type_used == SUBSET); + + if (used_sparse_linear_algebra_library) { const char* mixed_precision_suffix = (mixed_precision_solves_used ? "(Mixed Precision)" : ""); - StringAppendF( - &report, - "\nSparse linear algebra library %15s + %s %s\n", - SparseLinearAlgebraLibraryTypeToString( - sparse_linear_algebra_library_type), - LinearSolverOrderingTypeToString(linear_solver_ordering_type), - mixed_precision_suffix); + if (linear_solver_ordering_required) { + StringAppendF( + &report, + "\nSparse linear algebra library %15s + %s %s\n", + SparseLinearAlgebraLibraryTypeToString( + sparse_linear_algebra_library_type), + LinearSolverOrderingTypeToString(linear_solver_ordering_type), + mixed_precision_suffix); + } else { + StringAppendF( + &report, + "\nSparse linear algebra library %15s %s\n", + SparseLinearAlgebraLibraryTypeToString( + sparse_linear_algebra_library_type), + mixed_precision_suffix); + } } StringAppendF(&report, "\n");
diff --git a/internal/ceres/solver_test.cc b/internal/ceres/solver_test.cc index b4b2d34..92c1ddc 100644 --- a/internal/ceres/solver_test.cc +++ b/internal/ceres/solver_test.cc
@@ -878,6 +878,20 @@ options.dynamic_sparsity = false; options.use_mixed_precision_solves = true; EXPECT_FALSE(options.IsValid(&message)); + + options.sparse_linear_algebra_library_type = CUDA_SPARSE; + options.dynamic_sparsity = false; + options.use_mixed_precision_solves = false; + EXPECT_EQ(options.IsValid(&message), + IsSparseLinearAlgebraLibraryTypeAvailable(CUDA_SPARSE)); + + options.dynamic_sparsity = true; + options.use_mixed_precision_solves = false; + EXPECT_FALSE(options.IsValid(&message)); + + options.dynamic_sparsity = false; + options.use_mixed_precision_solves = true; + EXPECT_FALSE(options.IsValid(&message)); } TEST(Solver, CgnrOptionsJacobiPreconditioner) { @@ -940,6 +954,20 @@ options.dynamic_sparsity = false; options.use_mixed_precision_solves = true; EXPECT_FALSE(options.IsValid(&message)); + + options.sparse_linear_algebra_library_type = CUDA_SPARSE; + + options.dynamic_sparsity = false; + options.use_mixed_precision_solves = false; + EXPECT_FALSE(options.IsValid(&message)); + + options.dynamic_sparsity = true; + options.use_mixed_precision_solves = false; + EXPECT_FALSE(options.IsValid(&message)); + + options.dynamic_sparsity = false; + options.use_mixed_precision_solves = true; + EXPECT_FALSE(options.IsValid(&message)); } TEST(Solver, CgnrOptionsSubsetPreconditioner) { @@ -1012,6 +1040,19 @@ options.use_mixed_precision_solves = true; EXPECT_FALSE(options.IsValid(&message)); } + + options.sparse_linear_algebra_library_type = CUDA_SPARSE; + options.dynamic_sparsity = false; + options.use_mixed_precision_solves = false; + EXPECT_FALSE(options.IsValid(&message)); + + options.dynamic_sparsity = true; + options.use_mixed_precision_solves = false; + EXPECT_FALSE(options.IsValid(&message)); + + options.dynamic_sparsity = false; + options.use_mixed_precision_solves = true; + EXPECT_FALSE(options.IsValid(&message)); } TEST(Solver, CgnrOptionsSchurPreconditioners) {
diff --git a/internal/ceres/trust_region_preprocessor.cc b/internal/ceres/trust_region_preprocessor.cc index d27a472..84e783f 100644 --- a/internal/ceres/trust_region_preprocessor.cc +++ b/internal/ceres/trust_region_preprocessor.cc
@@ -261,6 +261,8 @@ const Solver::Options& options = pp->options; pp->evaluator_options = Evaluator::Options(); pp->evaluator_options.linear_solver_type = options.linear_solver_type; + pp->evaluator_options.sparse_linear_algebra_library_type = + options.sparse_linear_algebra_library_type; pp->evaluator_options.num_eliminate_blocks = 0; if (IsSchurType(options.linear_solver_type)) { pp->evaluator_options.num_eliminate_blocks =
diff --git a/internal/ceres/types.cc b/internal/ceres/types.cc index 1d514de..3de91ed 100644 --- a/internal/ceres/types.cc +++ b/internal/ceres/types.cc
@@ -108,6 +108,7 @@ CASESTR(SUITE_SPARSE); CASESTR(EIGEN_SPARSE); CASESTR(ACCELERATE_SPARSE); + CASESTR(CUDA_SPARSE); CASESTR(NO_SPARSE); default: return "UNKNOWN"; @@ -120,6 +121,7 @@ STRENUM(SUITE_SPARSE); STRENUM(EIGEN_SPARSE); STRENUM(ACCELERATE_SPARSE); + STRENUM(CUDA_SPARSE); STRENUM(NO_SPARSE); return false; } @@ -420,6 +422,14 @@ #endif } + if (type == CUDA_SPARSE) { +#ifdef CERES_NO_CUDA + return false; +#else + return true; +#endif + } + if (type == NO_SPARSE) { return true; }