Parallelize PSE preconditioner
Parallelization of remaining block-diagonal matrix-vector product and
vector operations makes parallel execution slightly faster
Before (Intel 8176 CPU, 10 iterations):
-----------------------------------------------------------------------
Benchmark Time
-----------------------------------------------------------------------
PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_median 26677 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_stddev 26.6 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_median 31037 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_stddev 191 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_median 16915 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_stddev 98.0 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_median 9175 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_stddev 44.1 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_median 4974 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_stddev 11.5 ms
After:
-----------------------------------------------------------------------
Benchmark Time
-----------------------------------------------------------------------
PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_median 26609 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_stddev 69.4 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_median 29178 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_stddev 367 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_median 16152 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_stddev 106 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_median 8773 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_stddev 41.5 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_median 4800 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_stddev 14.7 ms
Change-Id: Ib1d1b0c4edf9c556a9e996c49486d2726efcc558
diff --git a/internal/ceres/eigen_vector_ops.h b/internal/ceres/eigen_vector_ops.h
index 525f2dd..b7f57f0 100644
--- a/internal/ceres/eigen_vector_ops.h
+++ b/internal/ceres/eigen_vector_ops.h
@@ -42,7 +42,10 @@
// Blas1 operations on Eigen vectors. These functions are needed as an
// abstraction layer so that we can use different versions of a vector style
// object in the conjugate gradients linear solver.
-inline double Norm(const Vector& x, ContextImpl* context, int num_threads) {
+template <typename Derived>
+inline double Norm(const Eigen::DenseBase<Derived>& x,
+ ContextImpl* context,
+ int num_threads) {
std::vector<double> norms(num_threads);
ParallelFor(context,
0,
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 15b0534..c679885 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -43,6 +43,7 @@
#include "ceres/evaluator.h"
#include "ceres/implicit_schur_complement.h"
#include "ceres/partitioned_matrix_view.h"
+#include "ceres/power_series_expansion_preconditioner.h"
#include "ceres/preprocessor.h"
#include "ceres/problem.h"
#include "ceres/problem_impl.h"
@@ -327,6 +328,29 @@
CHECK_GT(state_plus_delta.squaredNorm(), 0.);
}
+static void PSEPreconditioner(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ LinearSolver::Options options;
+ options.num_threads = static_cast<int>(state.range(0));
+ options.elimination_groups.push_back(data->bal_problem->num_points());
+ options.context = context;
+
+ auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
+ Preconditioner::Options preconditioner_options(options);
+
+ PowerSeriesExpansionPreconditioner preconditioner(
+ jacobian, 10, 0, preconditioner_options);
+
+ Vector y = Vector::Zero(jacobian->num_cols());
+ Vector x = Vector::Random(jacobian->num_cols());
+
+ for (auto _ : state) {
+ preconditioner.RightMultiplyAndAccumulate(x.data(), y.data());
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+
static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
BALData* data,
ContextImpl* context) {
@@ -884,6 +908,16 @@
->Arg(8)
->Arg(16);
+ const std::string name_pse =
+ "PSEPreconditionerRightMultiplyAndAccumulate<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_pse.c_str(), ceres::internal::PSEPreconditioner, data, &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
+
const std::string name_isc_no_diag =
"ISCRightMultiplyAndAccumulate<" + path + ">";
::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc
index c2e4b35..bcfb6e4 100644
--- a/internal/ceres/iterative_schur_complement_solver.cc
+++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -97,10 +97,13 @@
reduced_linear_system_solution_.resize(schur_complement_->num_rows());
reduced_linear_system_solution_.setZero();
if (options_.use_spse_initialization) {
+ Preconditioner::Options preconditioner_options(options_);
+ preconditioner_options.type = SCHUR_POWER_SERIES_EXPANSION;
PowerSeriesExpansionPreconditioner pse_solver(
schur_complement_.get(),
options_.max_num_spse_iterations,
- options_.spse_tolerance);
+ options_.spse_tolerance,
+ preconditioner_options);
pse_solver.RightMultiplyAndAccumulate(
schur_complement_->rhs().data(),
reduced_linear_system_solution_.data());
@@ -158,19 +161,8 @@
return;
}
- Preconditioner::Options preconditioner_options;
- preconditioner_options.type = options_.preconditioner_type;
- preconditioner_options.visibility_clustering_type =
- options_.visibility_clustering_type;
- preconditioner_options.sparse_linear_algebra_library_type =
- options_.sparse_linear_algebra_library_type;
- preconditioner_options.num_threads = options_.num_threads;
- preconditioner_options.row_block_size = options_.row_block_size;
- preconditioner_options.e_block_size = options_.e_block_size;
- preconditioner_options.f_block_size = options_.f_block_size;
- preconditioner_options.elimination_groups = options_.elimination_groups;
+ Preconditioner::Options preconditioner_options(options_);
CHECK(options_.context != nullptr);
- preconditioner_options.context = options_.context;
switch (options_.preconditioner_type) {
case IDENTITY:
@@ -186,7 +178,10 @@
// Ignoring the value of spse_tolerance to ensure preconditioner stays
// fixed during the iterations of cg.
preconditioner_ = std::make_unique<PowerSeriesExpansionPreconditioner>(
- schur_complement_.get(), options_.max_num_spse_iterations, 0);
+ schur_complement_.get(),
+ options_.max_num_spse_iterations,
+ 0,
+ preconditioner_options);
break;
case SCHUR_JACOBI:
preconditioner_ = std::make_unique<SchurJacobiPreconditioner>(
diff --git a/internal/ceres/power_series_expansion_preconditioner.cc b/internal/ceres/power_series_expansion_preconditioner.cc
index 76de2fc..53fdc27 100644
--- a/internal/ceres/power_series_expansion_preconditioner.cc
+++ b/internal/ceres/power_series_expansion_preconditioner.cc
@@ -30,15 +30,20 @@
#include "ceres/power_series_expansion_preconditioner.h"
+#include "ceres/eigen_vector_ops.h"
+#include "ceres/parallel_vector_ops.h"
+
namespace ceres::internal {
PowerSeriesExpansionPreconditioner::PowerSeriesExpansionPreconditioner(
const ImplicitSchurComplement* isc,
const int max_num_spse_iterations,
- const double spse_tolerance)
+ const double spse_tolerance,
+ const Preconditioner::Options& options)
: isc_(isc),
max_num_spse_iterations_(max_num_spse_iterations),
- spse_tolerance_(spse_tolerance) {}
+ spse_tolerance_(spse_tolerance),
+ options_(options) {}
PowerSeriesExpansionPreconditioner::~PowerSeriesExpansionPreconditioner() =
default;
@@ -53,17 +58,21 @@
VectorRef yref(y, num_rows());
Vector series_term(num_rows());
Vector previous_series_term(num_rows());
- yref.setZero();
- isc_->block_diagonal_FtF_inverse()->RightMultiplyAndAccumulate(x, y);
- previous_series_term = yref;
+ ParallelSetZero(options_.context, options_.num_threads, yref);
+ isc_->block_diagonal_FtF_inverse()->RightMultiplyAndAccumulate(
+ x, y, options_.context, options_.num_threads);
+ ParallelAssign(
+ options_.context, options_.num_threads, previous_series_term, yref);
- const double norm_threshold = spse_tolerance_ * yref.norm();
+ const double norm_threshold =
+ spse_tolerance_ * Norm(yref, options_.context, options_.num_threads);
for (int i = 1;; i++) {
- series_term.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, series_term);
isc_->InversePowerSeriesOperatorRightMultiplyAccumulate(
previous_series_term.data(), series_term.data());
- yref += series_term;
+ ParallelAssign(
+ options_.context, options_.num_threads, yref, yref + series_term);
if (i >= max_num_spse_iterations_ || series_term.norm() < norm_threshold) {
break;
}
diff --git a/internal/ceres/power_series_expansion_preconditioner.h b/internal/ceres/power_series_expansion_preconditioner.h
index 84e721e..9a993cf 100644
--- a/internal/ceres/power_series_expansion_preconditioner.h
+++ b/internal/ceres/power_series_expansion_preconditioner.h
@@ -41,20 +41,15 @@
// This is a preconditioner via power series expansion of Schur
// complement inverse based on "Weber et al, Power Bundle Adjustment for
// Large-Scale 3D Reconstruction".
-//
-//
-// TODO(https://github.com/ceres-solver/ceres-solver/issues/934): In
-// PowerSeriesExpansionPreconditioner::RightMultiplyAndAccumulate only
-// operations performed via ImplicitSchurComplement are threaded.
-// PowerSeriesExpansionPreconditioner will benefit from multithreading
-// applied to remaning operations (block-sparse right product and several
-// vector operations)
class CERES_NO_EXPORT PowerSeriesExpansionPreconditioner
: public Preconditioner {
public:
+ // TODO: Consider moving max_num_spse_iterations and spse_tolerance to
+ // Preconditioner::Options
PowerSeriesExpansionPreconditioner(const ImplicitSchurComplement* isc,
const int max_num_spse_iterations,
- const double spse_tolerance);
+ const double spse_tolerance,
+ const Preconditioner::Options& options);
PowerSeriesExpansionPreconditioner(
const PowerSeriesExpansionPreconditioner&) = delete;
void operator=(const PowerSeriesExpansionPreconditioner&) = delete;
@@ -68,6 +63,7 @@
const ImplicitSchurComplement* isc_;
const int max_num_spse_iterations_;
const double spse_tolerance_;
+ const Preconditioner::Options options_;
};
} // namespace ceres::internal
diff --git a/internal/ceres/power_series_expansion_preconditioner_test.cc b/internal/ceres/power_series_expansion_preconditioner_test.cc
index 6afca0b..1c04162 100644
--- a/internal/ceres/power_series_expansion_preconditioner_test.cc
+++ b/internal/ceres/power_series_expansion_preconditioner_test.cc
@@ -49,6 +49,7 @@
options_.elimination_groups.push_back(problem_->num_eliminate_blocks);
options_.preconditioner_type = SCHUR_POWER_SERIES_EXPANSION;
+ preconditioner_options_ = Preconditioner::Options(options_);
isc_ = std::make_unique<ImplicitSchurComplement>(options_);
isc_->Init(*A, D, problem_->b.get());
num_f_cols_ = isc_->rhs().rows();
@@ -78,6 +79,7 @@
int num_f_cols_;
Matrix sc_inverse_expected_;
LinearSolver::Options options_;
+ Preconditioner::Options preconditioner_options_;
};
TEST_F(PowerSeriesExpansionPreconditionerTest,
@@ -85,7 +87,7 @@
const double spse_tolerance = kEpsilon;
const int max_num_iterations = 50;
PowerSeriesExpansionPreconditioner preconditioner(
- isc_.get(), max_num_iterations, spse_tolerance);
+ isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_);
Vector x(num_f_cols_), y(num_f_cols_);
for (int i = 0; i < num_f_cols_; i++) {
@@ -108,7 +110,7 @@
const double spse_tolerance = 0;
const int max_num_iterations = 50;
PowerSeriesExpansionPreconditioner preconditioner_fixed_n_iterations(
- isc_.get(), max_num_iterations, spse_tolerance);
+ isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_);
Vector x(num_f_cols_), y(num_f_cols_);
for (int i = 0; i < num_f_cols_; i++) {
@@ -132,7 +134,7 @@
const double spse_tolerance = 1 / kEpsilon;
const int max_num_iterations = 50;
PowerSeriesExpansionPreconditioner preconditioner_bad_tolerance(
- isc_.get(), max_num_iterations, spse_tolerance);
+ isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_);
Vector x(num_f_cols_), y(num_f_cols_);
for (int i = 0; i < num_f_cols_; i++) {
@@ -153,7 +155,7 @@
const double spse_tolerance = kEpsilon;
const int max_num_iterations = 1;
PowerSeriesExpansionPreconditioner preconditioner_bad_iterations_limit(
- isc_.get(), max_num_iterations, spse_tolerance);
+ isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_);
Vector x(num_f_cols_), y(num_f_cols_);
for (int i = 0; i < num_f_cols_; i++) {
diff --git a/internal/ceres/preconditioner.h b/internal/ceres/preconditioner.h
index 2903d3b..07a7b1f 100644
--- a/internal/ceres/preconditioner.h
+++ b/internal/ceres/preconditioner.h
@@ -51,6 +51,20 @@
class CERES_NO_EXPORT Preconditioner : public LinearOperator {
public:
struct Options {
+ Options() = default;
+ Options(const LinearSolver::Options& linear_solver_options)
+ : type(linear_solver_options.preconditioner_type),
+ visibility_clustering_type(
+ linear_solver_options.visibility_clustering_type),
+ sparse_linear_algebra_library_type(
+ linear_solver_options.sparse_linear_algebra_library_type),
+ num_threads(linear_solver_options.num_threads),
+ row_block_size(linear_solver_options.row_block_size),
+ e_block_size(linear_solver_options.e_block_size),
+ f_block_size(linear_solver_options.f_block_size),
+ elimination_groups(linear_solver_options.elimination_groups),
+ context(linear_solver_options.context) {}
+
PreconditionerType type = JACOBI;
VisibilityClusteringType visibility_clustering_type = CANONICAL_VIEWS;
SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type =