Parallelize PSE preconditioner Parallelization of remaining block-diagonal matrix-vector product and vector operations makes parallel execution slightly faster Before (Intel 8176 CPU, 10 iterations): ----------------------------------------------------------------------- Benchmark Time ----------------------------------------------------------------------- PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_median 26677 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_stddev 26.6 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_median 31037 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_stddev 191 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_median 16915 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_stddev 98.0 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_median 9175 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_stddev 44.1 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_median 4974 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_stddev 11.5 ms After: ----------------------------------------------------------------------- Benchmark Time ----------------------------------------------------------------------- PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_median 26609 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_stddev 69.4 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_median 29178 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_stddev 367 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_median 16152 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_stddev 106 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_median 8773 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_stddev 41.5 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_median 4800 ms PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_stddev 14.7 ms Change-Id: Ib1d1b0c4edf9c556a9e996c49486d2726efcc558
diff --git a/internal/ceres/eigen_vector_ops.h b/internal/ceres/eigen_vector_ops.h index 525f2dd..b7f57f0 100644 --- a/internal/ceres/eigen_vector_ops.h +++ b/internal/ceres/eigen_vector_ops.h
@@ -42,7 +42,10 @@ // Blas1 operations on Eigen vectors. These functions are needed as an // abstraction layer so that we can use different versions of a vector style // object in the conjugate gradients linear solver. -inline double Norm(const Vector& x, ContextImpl* context, int num_threads) { +template <typename Derived> +inline double Norm(const Eigen::DenseBase<Derived>& x, + ContextImpl* context, + int num_threads) { std::vector<double> norms(num_threads); ParallelFor(context, 0,
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc index 15b0534..c679885 100644 --- a/internal/ceres/evaluation_benchmark.cc +++ b/internal/ceres/evaluation_benchmark.cc
@@ -43,6 +43,7 @@ #include "ceres/evaluator.h" #include "ceres/implicit_schur_complement.h" #include "ceres/partitioned_matrix_view.h" +#include "ceres/power_series_expansion_preconditioner.h" #include "ceres/preprocessor.h" #include "ceres/problem.h" #include "ceres/problem_impl.h" @@ -327,6 +328,29 @@ CHECK_GT(state_plus_delta.squaredNorm(), 0.); } +static void PSEPreconditioner(benchmark::State& state, + BALData* data, + ContextImpl* context) { + LinearSolver::Options options; + options.num_threads = static_cast<int>(state.range(0)); + options.elimination_groups.push_back(data->bal_problem->num_points()); + options.context = context; + + auto jacobian = data->ImplicitSchurComplementWithDiagonal(options); + Preconditioner::Options preconditioner_options(options); + + PowerSeriesExpansionPreconditioner preconditioner( + jacobian, 10, 0, preconditioner_options); + + Vector y = Vector::Zero(jacobian->num_cols()); + Vector x = Vector::Random(jacobian->num_cols()); + + for (auto _ : state) { + preconditioner.RightMultiplyAndAccumulate(x.data(), y.data()); + } + CHECK_GT(y.squaredNorm(), 0.); +} + static void PMVRightMultiplyAndAccumulateF(benchmark::State& state, BALData* data, ContextImpl* context) { @@ -884,6 +908,16 @@ ->Arg(8) ->Arg(16); + const std::string name_pse = + "PSEPreconditionerRightMultiplyAndAccumulate<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_pse.c_str(), ceres::internal::PSEPreconditioner, data, &context) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); + const std::string name_isc_no_diag = "ISCRightMultiplyAndAccumulate<" + path + ">"; ::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc index c2e4b35..bcfb6e4 100644 --- a/internal/ceres/iterative_schur_complement_solver.cc +++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -97,10 +97,13 @@ reduced_linear_system_solution_.resize(schur_complement_->num_rows()); reduced_linear_system_solution_.setZero(); if (options_.use_spse_initialization) { + Preconditioner::Options preconditioner_options(options_); + preconditioner_options.type = SCHUR_POWER_SERIES_EXPANSION; PowerSeriesExpansionPreconditioner pse_solver( schur_complement_.get(), options_.max_num_spse_iterations, - options_.spse_tolerance); + options_.spse_tolerance, + preconditioner_options); pse_solver.RightMultiplyAndAccumulate( schur_complement_->rhs().data(), reduced_linear_system_solution_.data()); @@ -158,19 +161,8 @@ return; } - Preconditioner::Options preconditioner_options; - preconditioner_options.type = options_.preconditioner_type; - preconditioner_options.visibility_clustering_type = - options_.visibility_clustering_type; - preconditioner_options.sparse_linear_algebra_library_type = - options_.sparse_linear_algebra_library_type; - preconditioner_options.num_threads = options_.num_threads; - preconditioner_options.row_block_size = options_.row_block_size; - preconditioner_options.e_block_size = options_.e_block_size; - preconditioner_options.f_block_size = options_.f_block_size; - preconditioner_options.elimination_groups = options_.elimination_groups; + Preconditioner::Options preconditioner_options(options_); CHECK(options_.context != nullptr); - preconditioner_options.context = options_.context; switch (options_.preconditioner_type) { case IDENTITY: @@ -186,7 +178,10 @@ // Ignoring the value of spse_tolerance to ensure preconditioner stays // fixed during the iterations of cg. preconditioner_ = std::make_unique<PowerSeriesExpansionPreconditioner>( - schur_complement_.get(), options_.max_num_spse_iterations, 0); + schur_complement_.get(), + options_.max_num_spse_iterations, + 0, + preconditioner_options); break; case SCHUR_JACOBI: preconditioner_ = std::make_unique<SchurJacobiPreconditioner>(
diff --git a/internal/ceres/power_series_expansion_preconditioner.cc b/internal/ceres/power_series_expansion_preconditioner.cc index 76de2fc..53fdc27 100644 --- a/internal/ceres/power_series_expansion_preconditioner.cc +++ b/internal/ceres/power_series_expansion_preconditioner.cc
@@ -30,15 +30,20 @@ #include "ceres/power_series_expansion_preconditioner.h" +#include "ceres/eigen_vector_ops.h" +#include "ceres/parallel_vector_ops.h" + namespace ceres::internal { PowerSeriesExpansionPreconditioner::PowerSeriesExpansionPreconditioner( const ImplicitSchurComplement* isc, const int max_num_spse_iterations, - const double spse_tolerance) + const double spse_tolerance, + const Preconditioner::Options& options) : isc_(isc), max_num_spse_iterations_(max_num_spse_iterations), - spse_tolerance_(spse_tolerance) {} + spse_tolerance_(spse_tolerance), + options_(options) {} PowerSeriesExpansionPreconditioner::~PowerSeriesExpansionPreconditioner() = default; @@ -53,17 +58,21 @@ VectorRef yref(y, num_rows()); Vector series_term(num_rows()); Vector previous_series_term(num_rows()); - yref.setZero(); - isc_->block_diagonal_FtF_inverse()->RightMultiplyAndAccumulate(x, y); - previous_series_term = yref; + ParallelSetZero(options_.context, options_.num_threads, yref); + isc_->block_diagonal_FtF_inverse()->RightMultiplyAndAccumulate( + x, y, options_.context, options_.num_threads); + ParallelAssign( + options_.context, options_.num_threads, previous_series_term, yref); - const double norm_threshold = spse_tolerance_ * yref.norm(); + const double norm_threshold = + spse_tolerance_ * Norm(yref, options_.context, options_.num_threads); for (int i = 1;; i++) { - series_term.setZero(); + ParallelSetZero(options_.context, options_.num_threads, series_term); isc_->InversePowerSeriesOperatorRightMultiplyAccumulate( previous_series_term.data(), series_term.data()); - yref += series_term; + ParallelAssign( + options_.context, options_.num_threads, yref, yref + series_term); if (i >= max_num_spse_iterations_ || series_term.norm() < norm_threshold) { break; }
diff --git a/internal/ceres/power_series_expansion_preconditioner.h b/internal/ceres/power_series_expansion_preconditioner.h index 84e721e..9a993cf 100644 --- a/internal/ceres/power_series_expansion_preconditioner.h +++ b/internal/ceres/power_series_expansion_preconditioner.h
@@ -41,20 +41,15 @@ // This is a preconditioner via power series expansion of Schur // complement inverse based on "Weber et al, Power Bundle Adjustment for // Large-Scale 3D Reconstruction". -// -// -// TODO(https://github.com/ceres-solver/ceres-solver/issues/934): In -// PowerSeriesExpansionPreconditioner::RightMultiplyAndAccumulate only -// operations performed via ImplicitSchurComplement are threaded. -// PowerSeriesExpansionPreconditioner will benefit from multithreading -// applied to remaning operations (block-sparse right product and several -// vector operations) class CERES_NO_EXPORT PowerSeriesExpansionPreconditioner : public Preconditioner { public: + // TODO: Consider moving max_num_spse_iterations and spse_tolerance to + // Preconditioner::Options PowerSeriesExpansionPreconditioner(const ImplicitSchurComplement* isc, const int max_num_spse_iterations, - const double spse_tolerance); + const double spse_tolerance, + const Preconditioner::Options& options); PowerSeriesExpansionPreconditioner( const PowerSeriesExpansionPreconditioner&) = delete; void operator=(const PowerSeriesExpansionPreconditioner&) = delete; @@ -68,6 +63,7 @@ const ImplicitSchurComplement* isc_; const int max_num_spse_iterations_; const double spse_tolerance_; + const Preconditioner::Options options_; }; } // namespace ceres::internal
diff --git a/internal/ceres/power_series_expansion_preconditioner_test.cc b/internal/ceres/power_series_expansion_preconditioner_test.cc index 6afca0b..1c04162 100644 --- a/internal/ceres/power_series_expansion_preconditioner_test.cc +++ b/internal/ceres/power_series_expansion_preconditioner_test.cc
@@ -49,6 +49,7 @@ options_.elimination_groups.push_back(problem_->num_eliminate_blocks); options_.preconditioner_type = SCHUR_POWER_SERIES_EXPANSION; + preconditioner_options_ = Preconditioner::Options(options_); isc_ = std::make_unique<ImplicitSchurComplement>(options_); isc_->Init(*A, D, problem_->b.get()); num_f_cols_ = isc_->rhs().rows(); @@ -78,6 +79,7 @@ int num_f_cols_; Matrix sc_inverse_expected_; LinearSolver::Options options_; + Preconditioner::Options preconditioner_options_; }; TEST_F(PowerSeriesExpansionPreconditionerTest, @@ -85,7 +87,7 @@ const double spse_tolerance = kEpsilon; const int max_num_iterations = 50; PowerSeriesExpansionPreconditioner preconditioner( - isc_.get(), max_num_iterations, spse_tolerance); + isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_); Vector x(num_f_cols_), y(num_f_cols_); for (int i = 0; i < num_f_cols_; i++) { @@ -108,7 +110,7 @@ const double spse_tolerance = 0; const int max_num_iterations = 50; PowerSeriesExpansionPreconditioner preconditioner_fixed_n_iterations( - isc_.get(), max_num_iterations, spse_tolerance); + isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_); Vector x(num_f_cols_), y(num_f_cols_); for (int i = 0; i < num_f_cols_; i++) { @@ -132,7 +134,7 @@ const double spse_tolerance = 1 / kEpsilon; const int max_num_iterations = 50; PowerSeriesExpansionPreconditioner preconditioner_bad_tolerance( - isc_.get(), max_num_iterations, spse_tolerance); + isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_); Vector x(num_f_cols_), y(num_f_cols_); for (int i = 0; i < num_f_cols_; i++) { @@ -153,7 +155,7 @@ const double spse_tolerance = kEpsilon; const int max_num_iterations = 1; PowerSeriesExpansionPreconditioner preconditioner_bad_iterations_limit( - isc_.get(), max_num_iterations, spse_tolerance); + isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_); Vector x(num_f_cols_), y(num_f_cols_); for (int i = 0; i < num_f_cols_; i++) {
diff --git a/internal/ceres/preconditioner.h b/internal/ceres/preconditioner.h index 2903d3b..07a7b1f 100644 --- a/internal/ceres/preconditioner.h +++ b/internal/ceres/preconditioner.h
@@ -51,6 +51,20 @@ class CERES_NO_EXPORT Preconditioner : public LinearOperator { public: struct Options { + Options() = default; + Options(const LinearSolver::Options& linear_solver_options) + : type(linear_solver_options.preconditioner_type), + visibility_clustering_type( + linear_solver_options.visibility_clustering_type), + sparse_linear_algebra_library_type( + linear_solver_options.sparse_linear_algebra_library_type), + num_threads(linear_solver_options.num_threads), + row_block_size(linear_solver_options.row_block_size), + e_block_size(linear_solver_options.e_block_size), + f_block_size(linear_solver_options.f_block_size), + elimination_groups(linear_solver_options.elimination_groups), + context(linear_solver_options.context) {} + PreconditionerType type = JACOBI; VisibilityClusteringType visibility_clustering_type = CANONICAL_VIEWS; SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type =