Added CUDA Jacobi Preconditioner
* Added CudaCpuPreconditionerWrapper, a templated wrapper to leverage
existing CPU preconditioners that provide CompressedRowSparse
representation, for use with CudaCgnrSolver.
* Added CudaJacobiPreconditioner as a specific instantiation of
CudaCpuPreconditionerWrapper over BlockCRSJacobiPreconditioner.
The resulting Jacobi-preconditioned CudaCgnrSolver exhibits moderate
increase in speed with a healthy increase in numerical stability.
Examples of running CudaCgnrSolver with Jacobi vs. Identity
preconditioner for bundle adjustment on problem-13682-4456117-pre.txt
on a desktop computer with an Intel(R) Core(TM) i9-9940X CPU @ 3.30GHz
and an Nvidia Quadro RTX 6000 GPU.
A known issue is that the Jacobi preconditioner update is slow - this
can be sped up in the future by multithreading on the CPU, or by
computing the preconditioner on the GPU.
====================================================================
CUDA CGNR + IDENTITY Preconditioner
====================================================================
iter cost cost_change |gradient| |step| tr_ratio tr_radius ls_iter iter_time total_time
0 1.126372e+09 0.00e+00 3.02e+15 0.00e+00 0.00e+00 1.00e+04 0 9.17e+00 2.70e+01
1 3.796903e+07 1.09e+09 1.08e+15 6.48e+03 9.95e-01 3.00e+04 49 1.55e+01 4.24e+01
2 3.529024e+21 -3.53e+21 1.08e+15 1.93e+04 -2.25e+14 1.50e+04 500 3.79e+01 8.03e+01
3 7.952482e+09 -7.91e+09 1.08e+15 1.63e+04 -5.16e+02 3.75e+03 386 2.96e+01 1.10e+02
4 8.842366e+08 -8.46e+08 1.08e+15 1.18e+04 -5.87e+01 4.69e+02 312 2.47e+01 1.35e+02
5 2.935800e+07 8.61e+06 1.24e+13 3.93e+03 8.19e-01 6.32e+02 129 1.96e+01 1.54e+02
6 2.529342e+07 4.06e+06 3.22e+12 3.39e+03 9.93e-01 1.90e+03 148 2.16e+01 1.76e+02
7 2.343028e+07 1.86e+06 1.27e+13 6.00e+03 9.37e-01 5.68e+03 387 3.79e+01 2.14e+02
8 2.332405e+07 1.06e+05 7.92e+12 6.47e+01 8.26e-01 7.87e+03 46 1.48e+01 2.28e+02
9 3.591512e+14 -3.59e+14 7.92e+12 5.19e+03 -4.09e+08 3.94e+03 500 3.83e+01 2.67e+02
10 2.956078e+07 -6.24e+06 7.92e+12 3.94e+03 -8.34e+00 9.84e+02 447 3.40e+01 3.01e+02
Solver Summary (v 2.2.0-eigen-(3.3.7)-lapack-suitesparse-(5.7.1)-metis-(5.1.0)-eigensparse-no_openmp-cuda-(11070))
Original Reduced
Parameter blocks 4469799 4469799
Parameters 13491489 13491489
Residual blocks 28987644 28987644
Residuals 57975288 57975288
Minimizer TRUST_REGION
Trust region strategy LEVENBERG_MARQUARDT
Sparse linear algebra library CUDA_SPARSE
Given Used
Linear solver CGNR CGNR
Preconditioner IDENTITY IDENTITY
Threads 24 24
Linear solver ordering 4456117,13682 4456117,13682
Cost:
Initial 1.126372e+09
Final 2.332405e+07
Change 1.103048e+09
Minimizer iterations 11
Successful steps 6
Unsuccessful steps 5
Time (in seconds):
Preprocessor 17.798455
Residual only evaluation 8.656350 (10)
Jacobian & residual evaluation 35.794169 (6)
Linear solver 208.586109 (10)
Minimizer 283.593882
Postprocessor 1.985112
Total 303.377449
====================================================================
CUDA CGNR + JACOBI Preconditioner
====================================================================
iter cost cost_change |gradient| |step| tr_ratio tr_radius ls_iter iter_time total_time
0 1.126372e+09 0.00e+00 3.02e+15 0.00e+00 0.00e+00 1.00e+04 0 9.18e+00 2.68e+01
1 1.491225e+14 -1.49e+14 3.02e+15 8.73e+05 -1.36e+05 5.00e+03 6 1.07e+01 3.74e+01
2 7.461790e+12 -7.46e+12 3.02e+15 4.78e+05 -6.78e+03 1.25e+03 6 8.88e+00 4.63e+01
3 4.357438e+07 1.08e+09 5.41e+13 1.57e+05 9.87e-01 3.75e+03 6 1.62e+01 6.25e+01
4 2.928453e+07 1.43e+07 6.85e+13 3.81e+05 6.97e-01 3.99e+03 23 1.82e+01 8.08e+01
5 5.977433e+14 -5.98e+14 6.85e+13 4.97e+05 -8.70e+07 2.00e+03 9 9.93e+00 9.07e+01
6 6.423965e+10 -6.42e+10 6.85e+13 2.45e+05 -9.61e+03 4.99e+02 7 8.95e+00 9.97e+01
7 2.337471e+07 5.91e+06 1.49e+12 8.24e+04 9.31e-01 1.39e+03 4 1.60e+01 1.16e+02
8 2.257524e+07 7.99e+05 8.71e+12 1.62e+05 8.99e-01 2.81e+03 54 2.03e+01 1.36e+02
9 2.285090e+07 -2.76e+05 8.71e+12 3.56e+05 -6.13e-01 1.41e+03 104 1.65e+01 1.52e+02
10 5.909983e+12 -5.91e+12 8.71e+12 1.78e+05 -1.83e+07 3.52e+02 74 1.36e+01 1.66e+02
Solver Summary (v 2.2.0-eigen-(3.3.7)-lapack-suitesparse-(5.7.1)-metis-(5.1.0)-eigensparse-no_openmp-cuda-(11070))
Original Reduced
Parameter blocks 4469799 4469799
Parameters 13491489 13491489
Residual blocks 28987644 28987644
Residuals 57975288 57975288
Minimizer TRUST_REGION
Trust region strategy LEVENBERG_MARQUARDT
Sparse linear algebra library CUDA_SPARSE
Given Used
Linear solver CGNR CGNR
Preconditioner JACOBI JACOBI
Threads 24 24
Linear solver ordering 4456117,13682 4456117,13682
Cost:
Initial 1.126372e+09
Final 2.257524e+07
Change 1.103796e+09
Minimizer iterations 11
Successful steps 5
Unsuccessful steps 6
Time (in seconds):
Preprocessor 17.575990
Residual only evaluation 8.461414 (10)
Jacobian & residual evaluation 30.055383 (5)
Linear solver 82.550675 (10)
Minimizer 149.137856
Postprocessor 1.994014
Total 168.707860
Change-Id: I458d2445bf062e54de44fc91517ed11a300c7182
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index 5374f51..015e170 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -224,13 +224,41 @@
};
class CERES_NO_EXPORT CudaIdentityPreconditioner final
- : public ConjugateGradientsLinearOperator<CudaVector> {
+ : public CudaPreconditioner {
public:
+ void Update(const CompressedRowSparseMatrix& A, const double* D) final {}
void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
y.Axpby(1.0, x, 1.0);
}
};
+// This class wraps the existing CPU Jacobi preconditioner, caches the structure
+// of the block diagonal, and for each CGNR solve updates the values on the CPU
+// and then copies them over to the GPU.
+class CERES_NO_EXPORT CudaJacobiPreconditioner final
+ : public CudaPreconditioner {
+ public:
+ explicit CudaJacobiPreconditioner(ContextImpl* context,
+ const CompressedRowSparseMatrix& A) :
+ cpu_preconditioner_(A),
+ m_(context, cpu_preconditioner_.matrix()) {}
+ ~CudaJacobiPreconditioner() = default;
+
+ void Update(const CompressedRowSparseMatrix& A, const double* D) final {
+ cpu_preconditioner_.Update(A, D);
+ m_.CopyValuesFromCpu(cpu_preconditioner_.matrix());
+ }
+
+ void RightMultiplyAndAccumulate(
+ const CudaVector& x, CudaVector& y) final {
+ m_.RightMultiplyAndAccumulate(x, &y);
+ }
+
+ private:
+ BlockCRSJacobiPreconditioner cpu_preconditioner_;
+ CudaSparseMatrix m_;
+};
+
CudaCgnrSolver::CudaCgnrSolver(LinearSolver::Options options)
: options_(std::move(options)) {}
@@ -246,7 +274,8 @@
std::unique_ptr<CudaCgnrSolver> CudaCgnrSolver::Create(
LinearSolver::Options options, std::string* error) {
CHECK(error != nullptr);
- if (options.preconditioner_type != IDENTITY) {
+ if (options.preconditioner_type != IDENTITY &&
+ options.preconditioner_type != JACOBI) {
*error =
"CudaCgnrSolver does not support preconditioner type " +
std::string(PreconditionerTypeToString(options.preconditioner_type)) +
@@ -270,6 +299,12 @@
Atb_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
Ax_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
D_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
+ if (options_.preconditioner_type == JACOBI) {
+ preconditioner_ =
+ std::make_unique<CudaJacobiPreconditioner>(options_.context, A);
+ } else {
+ preconditioner_ = std::make_unique<CudaIdentityPreconditioner>();
+ }
for (int i = 0; i < 4; ++i) {
scratch_[i] = new CudaVector(options_.context, A.num_cols());
}
@@ -293,6 +328,8 @@
CpuToGpuTransfer(*A, b, per_solve_options.D);
event_logger.AddEvent("CPU to GPU Transfer");
+ preconditioner_->Update(*A, per_solve_options.D);
+ event_logger.AddEvent("Preconditioner Update");
// Form z = Atb.
Atb_->SetZero();
@@ -311,9 +348,8 @@
cg_options.q_tolerance = per_solve_options.q_tolerance;
cg_options.r_tolerance = per_solve_options.r_tolerance;
- CudaIdentityPreconditioner preconditioner;
summary = ConjugateGradientsSolver(
- cg_options, lhs, *Atb_, preconditioner, scratch_, *x_);
+ cg_options, lhs, *Atb_, *preconditioner_, scratch_, *x_);
x_->CopyTo(x);
event_logger.AddEvent("Solve");
return summary;
diff --git a/internal/ceres/cgnr_solver.h b/internal/ceres/cgnr_solver.h
index 9078d01..5111d80 100644
--- a/internal/ceres/cgnr_solver.h
+++ b/internal/ceres/cgnr_solver.h
@@ -33,6 +33,7 @@
#include <memory>
+#include "ceres/conjugate_gradients_solver.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/internal/export.h"
@@ -71,6 +72,13 @@
};
#ifndef CERES_NO_CUDA
+class CudaPreconditioner : public
+ ConjugateGradientsLinearOperator<CudaVector> {
+ public:
+ virtual void Update(const CompressedRowSparseMatrix& A, const double* D) = 0;
+ virtual ~CudaPreconditioner() = default;
+};
+
// A Cuda-accelerated version of CgnrSolver.
// This solver assumes that the sparsity structure of A remains constant for its
// lifetime.
@@ -99,6 +107,7 @@
std::unique_ptr<CudaVector> Atb_;
std::unique_ptr<CudaVector> Ax_;
std::unique_ptr<CudaVector> D_;
+ std::unique_ptr<CudaPreconditioner> preconditioner_;
CudaVector* scratch_[4] = {nullptr, nullptr, nullptr, nullptr};
};
#endif // CERES_NO_CUDA
diff --git a/internal/ceres/solver.cc b/internal/ceres/solver.cc
index 0656d34..c896e32 100644
--- a/internal/ceres/solver.cc
+++ b/internal/ceres/solver.cc
@@ -327,8 +327,14 @@
return false;
}
- if (options.sparse_linear_algebra_library_type != CUDA_SPARSE &&
- options.preconditioner_type == SUBSET) {
+ if (options.preconditioner_type == SUBSET) {
+ if (options.sparse_linear_algebra_library_type == CUDA_SPARSE) {
+ *error =
+ "Can't use CGNR with preconditioner_type = SUBSET when "
+ "sparse_linear_algebra_library_type = CUDA_SPARSE.";
+ return false;
+ }
+
if (options.residual_blocks_for_subset_preconditioner.empty()) {
*error =
"When using SUBSET preconditioner, "
@@ -350,13 +356,6 @@
"CUDA_SPARSE because support was not enabled when Ceres was built.";
return false;
}
- if (options.preconditioner_type != IDENTITY) {
- *error = StringPrintf(
- "Can't use CGNR with preconditioner_type = %s when "
- "sparse_linear_algebra_library_type = CUDA_SPARSE.",
- PreconditionerTypeToString(options.preconditioner_type));
- return false;
- }
}
return true;
}
diff --git a/internal/ceres/solver_test.cc b/internal/ceres/solver_test.cc
index b796873..929e293 100644
--- a/internal/ceres/solver_test.cc
+++ b/internal/ceres/solver_test.cc
@@ -959,7 +959,8 @@
options.dynamic_sparsity = false;
options.use_mixed_precision_solves = false;
- EXPECT_FALSE(options.IsValid(&message));
+ EXPECT_EQ(options.IsValid(&message),
+ IsSparseLinearAlgebraLibraryTypeAvailable(CUDA_SPARSE));
options.dynamic_sparsity = true;
options.use_mixed_precision_solves = false;