Parallelize PSE preconditioner

Parallelization of remaining block-diagonal matrix-vector product and
vector operations makes parallel execution slightly faster

Before (Intel 8176 CPU, 10 iterations):
-----------------------------------------------------------------------
Benchmark                                                          Time
-----------------------------------------------------------------------
PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_median   26677 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_stddev    26.6 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_median   31037 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_stddev     191 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_median   16915 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_stddev    98.0 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_median    9175 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_stddev    44.1 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_median   4974 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_stddev   11.5 ms

After:
-----------------------------------------------------------------------
Benchmark                                                          Time
-----------------------------------------------------------------------
PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_median   26609 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/1_stddev    69.4 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_median   29178 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/2_stddev     367 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_median   16152 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/4_stddev     106 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_median    8773 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/8_stddev    41.5 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_median   4800 ms
PSEPreconditioner...<problem-13682-4456117-pre.txt>/16_stddev   14.7 ms

Change-Id: Ib1d1b0c4edf9c556a9e996c49486d2726efcc558
diff --git a/internal/ceres/eigen_vector_ops.h b/internal/ceres/eigen_vector_ops.h
index 525f2dd..b7f57f0 100644
--- a/internal/ceres/eigen_vector_ops.h
+++ b/internal/ceres/eigen_vector_ops.h
@@ -42,7 +42,10 @@
 // Blas1 operations on Eigen vectors. These functions are needed as an
 // abstraction layer so that we can use different versions of a vector style
 // object in the conjugate gradients linear solver.
-inline double Norm(const Vector& x, ContextImpl* context, int num_threads) {
+template <typename Derived>
+inline double Norm(const Eigen::DenseBase<Derived>& x,
+                   ContextImpl* context,
+                   int num_threads) {
   std::vector<double> norms(num_threads);
   ParallelFor(context,
               0,
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 15b0534..c679885 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -43,6 +43,7 @@
 #include "ceres/evaluator.h"
 #include "ceres/implicit_schur_complement.h"
 #include "ceres/partitioned_matrix_view.h"
+#include "ceres/power_series_expansion_preconditioner.h"
 #include "ceres/preprocessor.h"
 #include "ceres/problem.h"
 #include "ceres/problem_impl.h"
@@ -327,6 +328,29 @@
   CHECK_GT(state_plus_delta.squaredNorm(), 0.);
 }
 
+static void PSEPreconditioner(benchmark::State& state,
+                              BALData* data,
+                              ContextImpl* context) {
+  LinearSolver::Options options;
+  options.num_threads = static_cast<int>(state.range(0));
+  options.elimination_groups.push_back(data->bal_problem->num_points());
+  options.context = context;
+
+  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
+  Preconditioner::Options preconditioner_options(options);
+
+  PowerSeriesExpansionPreconditioner preconditioner(
+      jacobian, 10, 0, preconditioner_options);
+
+  Vector y = Vector::Zero(jacobian->num_cols());
+  Vector x = Vector::Random(jacobian->num_cols());
+
+  for (auto _ : state) {
+    preconditioner.RightMultiplyAndAccumulate(x.data(), y.data());
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+
 static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
                                            BALData* data,
                                            ContextImpl* context) {
@@ -884,6 +908,16 @@
         ->Arg(8)
         ->Arg(16);
 
+    const std::string name_pse =
+        "PSEPreconditionerRightMultiplyAndAccumulate<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_pse.c_str(), ceres::internal::PSEPreconditioner, data, &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
+
     const std::string name_isc_no_diag =
         "ISCRightMultiplyAndAccumulate<" + path + ">";
     ::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc
index c2e4b35..bcfb6e4 100644
--- a/internal/ceres/iterative_schur_complement_solver.cc
+++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -97,10 +97,13 @@
   reduced_linear_system_solution_.resize(schur_complement_->num_rows());
   reduced_linear_system_solution_.setZero();
   if (options_.use_spse_initialization) {
+    Preconditioner::Options preconditioner_options(options_);
+    preconditioner_options.type = SCHUR_POWER_SERIES_EXPANSION;
     PowerSeriesExpansionPreconditioner pse_solver(
         schur_complement_.get(),
         options_.max_num_spse_iterations,
-        options_.spse_tolerance);
+        options_.spse_tolerance,
+        preconditioner_options);
     pse_solver.RightMultiplyAndAccumulate(
         schur_complement_->rhs().data(),
         reduced_linear_system_solution_.data());
@@ -158,19 +161,8 @@
     return;
   }
 
-  Preconditioner::Options preconditioner_options;
-  preconditioner_options.type = options_.preconditioner_type;
-  preconditioner_options.visibility_clustering_type =
-      options_.visibility_clustering_type;
-  preconditioner_options.sparse_linear_algebra_library_type =
-      options_.sparse_linear_algebra_library_type;
-  preconditioner_options.num_threads = options_.num_threads;
-  preconditioner_options.row_block_size = options_.row_block_size;
-  preconditioner_options.e_block_size = options_.e_block_size;
-  preconditioner_options.f_block_size = options_.f_block_size;
-  preconditioner_options.elimination_groups = options_.elimination_groups;
+  Preconditioner::Options preconditioner_options(options_);
   CHECK(options_.context != nullptr);
-  preconditioner_options.context = options_.context;
 
   switch (options_.preconditioner_type) {
     case IDENTITY:
@@ -186,7 +178,10 @@
       // Ignoring the value of spse_tolerance to ensure preconditioner stays
       // fixed during the iterations of cg.
       preconditioner_ = std::make_unique<PowerSeriesExpansionPreconditioner>(
-          schur_complement_.get(), options_.max_num_spse_iterations, 0);
+          schur_complement_.get(),
+          options_.max_num_spse_iterations,
+          0,
+          preconditioner_options);
       break;
     case SCHUR_JACOBI:
       preconditioner_ = std::make_unique<SchurJacobiPreconditioner>(
diff --git a/internal/ceres/power_series_expansion_preconditioner.cc b/internal/ceres/power_series_expansion_preconditioner.cc
index 76de2fc..53fdc27 100644
--- a/internal/ceres/power_series_expansion_preconditioner.cc
+++ b/internal/ceres/power_series_expansion_preconditioner.cc
@@ -30,15 +30,20 @@
 
 #include "ceres/power_series_expansion_preconditioner.h"
 
+#include "ceres/eigen_vector_ops.h"
+#include "ceres/parallel_vector_ops.h"
+
 namespace ceres::internal {
 
 PowerSeriesExpansionPreconditioner::PowerSeriesExpansionPreconditioner(
     const ImplicitSchurComplement* isc,
     const int max_num_spse_iterations,
-    const double spse_tolerance)
+    const double spse_tolerance,
+    const Preconditioner::Options& options)
     : isc_(isc),
       max_num_spse_iterations_(max_num_spse_iterations),
-      spse_tolerance_(spse_tolerance) {}
+      spse_tolerance_(spse_tolerance),
+      options_(options) {}
 
 PowerSeriesExpansionPreconditioner::~PowerSeriesExpansionPreconditioner() =
     default;
@@ -53,17 +58,21 @@
   VectorRef yref(y, num_rows());
   Vector series_term(num_rows());
   Vector previous_series_term(num_rows());
-  yref.setZero();
-  isc_->block_diagonal_FtF_inverse()->RightMultiplyAndAccumulate(x, y);
-  previous_series_term = yref;
+  ParallelSetZero(options_.context, options_.num_threads, yref);
+  isc_->block_diagonal_FtF_inverse()->RightMultiplyAndAccumulate(
+      x, y, options_.context, options_.num_threads);
+  ParallelAssign(
+      options_.context, options_.num_threads, previous_series_term, yref);
 
-  const double norm_threshold = spse_tolerance_ * yref.norm();
+  const double norm_threshold =
+      spse_tolerance_ * Norm(yref, options_.context, options_.num_threads);
 
   for (int i = 1;; i++) {
-    series_term.setZero();
+    ParallelSetZero(options_.context, options_.num_threads, series_term);
     isc_->InversePowerSeriesOperatorRightMultiplyAccumulate(
         previous_series_term.data(), series_term.data());
-    yref += series_term;
+    ParallelAssign(
+        options_.context, options_.num_threads, yref, yref + series_term);
     if (i >= max_num_spse_iterations_ || series_term.norm() < norm_threshold) {
       break;
     }
diff --git a/internal/ceres/power_series_expansion_preconditioner.h b/internal/ceres/power_series_expansion_preconditioner.h
index 84e721e..9a993cf 100644
--- a/internal/ceres/power_series_expansion_preconditioner.h
+++ b/internal/ceres/power_series_expansion_preconditioner.h
@@ -41,20 +41,15 @@
 // This is a preconditioner via power series expansion of Schur
 // complement inverse based on "Weber et al, Power Bundle Adjustment for
 // Large-Scale 3D Reconstruction".
-//
-//
-// TODO(https://github.com/ceres-solver/ceres-solver/issues/934): In
-// PowerSeriesExpansionPreconditioner::RightMultiplyAndAccumulate only
-// operations performed via ImplicitSchurComplement are threaded.
-// PowerSeriesExpansionPreconditioner will benefit from multithreading
-// applied to remaning operations (block-sparse right product and several
-// vector operations)
 class CERES_NO_EXPORT PowerSeriesExpansionPreconditioner
     : public Preconditioner {
  public:
+  // TODO: Consider moving max_num_spse_iterations and spse_tolerance to
+  // Preconditioner::Options
   PowerSeriesExpansionPreconditioner(const ImplicitSchurComplement* isc,
                                      const int max_num_spse_iterations,
-                                     const double spse_tolerance);
+                                     const double spse_tolerance,
+                                     const Preconditioner::Options& options);
   PowerSeriesExpansionPreconditioner(
       const PowerSeriesExpansionPreconditioner&) = delete;
   void operator=(const PowerSeriesExpansionPreconditioner&) = delete;
@@ -68,6 +63,7 @@
   const ImplicitSchurComplement* isc_;
   const int max_num_spse_iterations_;
   const double spse_tolerance_;
+  const Preconditioner::Options options_;
 };
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/power_series_expansion_preconditioner_test.cc b/internal/ceres/power_series_expansion_preconditioner_test.cc
index 6afca0b..1c04162 100644
--- a/internal/ceres/power_series_expansion_preconditioner_test.cc
+++ b/internal/ceres/power_series_expansion_preconditioner_test.cc
@@ -49,6 +49,7 @@
 
     options_.elimination_groups.push_back(problem_->num_eliminate_blocks);
     options_.preconditioner_type = SCHUR_POWER_SERIES_EXPANSION;
+    preconditioner_options_ = Preconditioner::Options(options_);
     isc_ = std::make_unique<ImplicitSchurComplement>(options_);
     isc_->Init(*A, D, problem_->b.get());
     num_f_cols_ = isc_->rhs().rows();
@@ -78,6 +79,7 @@
   int num_f_cols_;
   Matrix sc_inverse_expected_;
   LinearSolver::Options options_;
+  Preconditioner::Options preconditioner_options_;
 };
 
 TEST_F(PowerSeriesExpansionPreconditionerTest,
@@ -85,7 +87,7 @@
   const double spse_tolerance = kEpsilon;
   const int max_num_iterations = 50;
   PowerSeriesExpansionPreconditioner preconditioner(
-      isc_.get(), max_num_iterations, spse_tolerance);
+      isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_);
 
   Vector x(num_f_cols_), y(num_f_cols_);
   for (int i = 0; i < num_f_cols_; i++) {
@@ -108,7 +110,7 @@
   const double spse_tolerance = 0;
   const int max_num_iterations = 50;
   PowerSeriesExpansionPreconditioner preconditioner_fixed_n_iterations(
-      isc_.get(), max_num_iterations, spse_tolerance);
+      isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_);
 
   Vector x(num_f_cols_), y(num_f_cols_);
   for (int i = 0; i < num_f_cols_; i++) {
@@ -132,7 +134,7 @@
   const double spse_tolerance = 1 / kEpsilon;
   const int max_num_iterations = 50;
   PowerSeriesExpansionPreconditioner preconditioner_bad_tolerance(
-      isc_.get(), max_num_iterations, spse_tolerance);
+      isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_);
 
   Vector x(num_f_cols_), y(num_f_cols_);
   for (int i = 0; i < num_f_cols_; i++) {
@@ -153,7 +155,7 @@
   const double spse_tolerance = kEpsilon;
   const int max_num_iterations = 1;
   PowerSeriesExpansionPreconditioner preconditioner_bad_iterations_limit(
-      isc_.get(), max_num_iterations, spse_tolerance);
+      isc_.get(), max_num_iterations, spse_tolerance, preconditioner_options_);
 
   Vector x(num_f_cols_), y(num_f_cols_);
   for (int i = 0; i < num_f_cols_; i++) {
diff --git a/internal/ceres/preconditioner.h b/internal/ceres/preconditioner.h
index 2903d3b..07a7b1f 100644
--- a/internal/ceres/preconditioner.h
+++ b/internal/ceres/preconditioner.h
@@ -51,6 +51,20 @@
 class CERES_NO_EXPORT Preconditioner : public LinearOperator {
  public:
   struct Options {
+    Options() = default;
+    Options(const LinearSolver::Options& linear_solver_options)
+        : type(linear_solver_options.preconditioner_type),
+          visibility_clustering_type(
+              linear_solver_options.visibility_clustering_type),
+          sparse_linear_algebra_library_type(
+              linear_solver_options.sparse_linear_algebra_library_type),
+          num_threads(linear_solver_options.num_threads),
+          row_block_size(linear_solver_options.row_block_size),
+          e_block_size(linear_solver_options.e_block_size),
+          f_block_size(linear_solver_options.f_block_size),
+          elimination_groups(linear_solver_options.elimination_groups),
+          context(linear_solver_options.context) {}
+
     PreconditionerType type = JACOBI;
     VisibilityClusteringType visibility_clustering_type = CANONICAL_VIEWS;
     SparseLinearAlgebraLibraryType sparse_linear_algebra_library_type =