Refactor PartitionedMatrixView to cache the partitions

The constructor now takes a LinearSolver::Options as input, computes the
partitioning once, and reuses it for the lifetime of the object.

Change-Id: I9ef30df0b60f8fa91c8b5601c397b2d9314a2cc7
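The construction pattern changes from PartitionedMatrixView(matrix, num_col_blocks_e) to PartitionedMatrixView(options, matrix), with the context, thread count, and elimination groups all supplied up front. A minimal sketch of the new call pattern, assuming the Ceres internal headers (the helper MakeView is hypothetical; it mirrors the benchmark code below):

#include <memory>

#include "ceres/block_sparse_matrix.h"
#include "ceres/context_impl.h"
#include "ceres/linear_solver.h"
#include "ceres/partitioned_matrix_view.h"

namespace ceres::internal {

std::unique_ptr<PartitionedMatrixViewBase> MakeView(
    const BlockSparseMatrix& jacobian,
    ContextImpl* context,
    int num_e_blocks,
    int num_threads) {
  LinearSolver::Options options;
  options.context = context;          // threading context, cached by the view
  options.num_threads = num_threads;  // used by all subsequent multiplies
  options.elimination_groups.push_back(num_e_blocks);  // defines the E/F split
  // For num_threads > 1 the matrix also needs its transpose block structure
  // (see AddTransposeBlockStructure in the tests below) so that the column
  // partitions can be computed from cumulative nonzero counts.
  return PartitionedMatrixViewBase::Create(options, jacobian);
}

}  // namespace ceres::internal

The point of the refactor: the cost-balanced column partitions are computed once at construction, and the multiply methods no longer take (context, num_threads) on every call.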
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 4a85e8b..06c0914 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -603,9 +603,6 @@
 add_executable(spmv_benchmark spmv_benchmark.cc)
 add_dependencies_to_benchmark(spmv_benchmark)
 
-add_executable(partitioned_matrix_view_benchmark partitioned_matrix_view_benchmark.cc)
-add_dependencies_to_benchmark(partitioned_matrix_view_benchmark)
-
 add_executable(block_jacobi_preconditioner_benchmark block_jacobi_preconditioner_benchmark.cc)
 add_dependencies_to_benchmark(block_jacobi_preconditioner_benchmark)
 
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 2f9fff0..9849b5a 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -145,23 +145,20 @@ return crs_jacobian.get(); } - const PartitionedView* PartitionedMatrixViewJacobian(ContextImpl* context) { - if (!partitioned_view_jacobian) { - auto block_sparse = BlockSparseJacobian(context); - partitioned_view_jacobian = std::make_unique<PartitionedView>( - *block_sparse, bal_problem->num_points()); - } + const PartitionedView* PartitionedMatrixViewJacobian( + const LinearSolver::Options& options) { + auto block_sparse = BlockSparseJacobian(options.context); + partitioned_view_jacobian = + std::make_unique<PartitionedView>(options, *block_sparse); return partitioned_view_jacobian.get(); } const PartitionedView* PartitionedMatrixViewJacobianWithTranspose( - ContextImpl* context) { - if (!partitioned_view_jacobian_with_transpose) { - auto block_sparse_transpose = BlockSparseJacobianWithTranspose(context); - partitioned_view_jacobian_with_transpose = - std::make_unique<PartitionedView>(*block_sparse_transpose, - bal_problem->num_points()); - } + const LinearSolver::Options& options) { + auto block_sparse_transpose = + BlockSparseJacobianWithTranspose(options.context); + partitioned_view_jacobian_with_transpose = + std::make_unique<PartitionedView>(options, *block_sparse_transpose); return partitioned_view_jacobian_with_transpose.get(); } @@ -243,16 +240,17 @@ static void PMVRightMultiplyAndAccumulateF(benchmark::State& state, BALData* data, ContextImpl* context) { - const int num_threads = state.range(0); - - auto jacobian = data->PartitionedMatrixViewJacobian(context); + LinearSolver::Options options; + options.num_threads = state.range(0); + options.elimination_groups.push_back(data->bal_problem->num_points()); + options.context = context; + auto jacobian = data->PartitionedMatrixViewJacobian(options); Vector y = Vector::Zero(jacobian->num_rows()); Vector x = Vector::Random(jacobian->num_cols_f()); for (auto _ : state) { - jacobian->RightMultiplyAndAccumulateF( - x.data(), y.data(), context, num_threads); + jacobian->RightMultiplyAndAccumulateF(x.data(), y.data()); } CHECK_GT(y.squaredNorm(), 0.); } @@ -260,49 +258,35 @@ static void PMVLeftMultiplyAndAccumulateF(benchmark::State& state, BALData* data, ContextImpl* context) { - const int num_threads = state.range(0); - - auto jacobian = data->PartitionedMatrixViewJacobian(context); + LinearSolver::Options options; + options.num_threads = state.range(0); + options.elimination_groups.push_back(data->bal_problem->num_points()); + options.context = context; + auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(options); Vector y = Vector::Zero(jacobian->num_cols_f()); Vector x = Vector::Random(jacobian->num_rows()); for (auto _ : state) { - jacobian->LeftMultiplyAndAccumulateF( - x.data(), y.data(), context, num_threads); + jacobian->LeftMultiplyAndAccumulateF(x.data(), y.data()); } CHECK_GT(y.squaredNorm(), 0.); } -static void PMVLeftMultiplyAndAccumulateWithTransposeF(benchmark::State& state, - BALData* data, - ContextImpl* context) { - const int num_threads = state.range(0); - - auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(context); - - Vector y = Vector::Zero(jacobian->num_cols_f()); - Vector x = Vector::Random(jacobian->num_rows()); - - for (auto _ : state) { - jacobian->LeftMultiplyAndAccumulateF( - x.data(), y.data(), context, num_threads); - } - CHECK_GT(y.squaredNorm(), 0.); -} static void PMVRightMultiplyAndAccumulateE(benchmark::State& state, BALData* data, ContextImpl* context) { - const int num_threads = state.range(0); - - auto jacobian = 
data->PartitionedMatrixViewJacobian(context); + LinearSolver::Options options; + options.num_threads = state.range(0); + options.elimination_groups.push_back(data->bal_problem->num_points()); + options.context = context; + auto jacobian = data->PartitionedMatrixViewJacobian(options); Vector y = Vector::Zero(jacobian->num_rows()); Vector x = Vector::Random(jacobian->num_cols_e()); for (auto _ : state) { - jacobian->RightMultiplyAndAccumulateE( - x.data(), y.data(), context, num_threads); + jacobian->RightMultiplyAndAccumulateE(x.data(), y.data()); } CHECK_GT(y.squaredNorm(), 0.); } @@ -310,33 +294,17 @@ static void PMVLeftMultiplyAndAccumulateE(benchmark::State& state, BALData* data, ContextImpl* context) { - const int num_threads = state.range(0); - - auto jacobian = data->PartitionedMatrixViewJacobian(context); + LinearSolver::Options options; + options.num_threads = state.range(0); + options.elimination_groups.push_back(data->bal_problem->num_points()); + options.context = context; + auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(options); Vector y = Vector::Zero(jacobian->num_cols_e()); Vector x = Vector::Random(jacobian->num_rows()); for (auto _ : state) { - jacobian->LeftMultiplyAndAccumulateE( - x.data(), y.data(), context, num_threads); - } - CHECK_GT(y.squaredNorm(), 0.); -} - -static void PMVLeftMultiplyAndAccumulateWithTransposeE(benchmark::State& state, - BALData* data, - ContextImpl* context) { - const int num_threads = state.range(0); - - auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(context); - - Vector y = Vector::Zero(jacobian->num_cols_e()); - Vector x = Vector::Random(jacobian->num_rows()); - - for (auto _ : state) { - jacobian->LeftMultiplyAndAccumulateE( - x.data(), y.data(), context, num_threads); + jacobian->LeftMultiplyAndAccumulateE(x.data(), y.data()); } CHECK_GT(y.squaredNorm(), 0.); } @@ -363,22 +331,6 @@ ContextImpl* context) { const int num_threads = state.range(0); - auto jacobian = data->BlockSparseJacobian(context); - - Vector y = Vector::Zero(jacobian->num_cols()); - Vector x = Vector::Random(jacobian->num_rows()); - - for (auto _ : state) { - jacobian->LeftMultiplyAndAccumulate( - x.data(), y.data(), context, num_threads); - } - CHECK_GT(y.squaredNorm(), 0.); -} - -static void JacobianLeftMultiplyAndAccumulateWithTranspose( - benchmark::State& state, BALData* data, ContextImpl* context) { - const int num_threads = state.range(0); - auto jacobian = data->BlockSparseJacobianWithTranspose(context); Vector y = Vector::Zero(jacobian->num_cols()); @@ -551,15 +503,6 @@ ceres::internal::JacobianLeftMultiplyAndAccumulate, data, &context) - ->Arg(1); - - const std::string name_left_product_transpose = - "JacobianLeftMultiplyAndAccumulateWithTranspose<" + path + ">"; - ::benchmark::RegisterBenchmark( - name_left_product_transpose.c_str(), - ceres::internal::JacobianLeftMultiplyAndAccumulateWithTranspose, - data, - &context) ->Arg(1) ->Arg(2) ->Arg(4) @@ -573,14 +516,6 @@ ceres::internal::PMVLeftMultiplyAndAccumulateF, data, &context) - ->Arg(1); - const std::string name_left_product_partitioned_transpose_f = - "PMVLeftMultiplyAndAccumulateWithTransposeF<" + path + ">"; - ::benchmark::RegisterBenchmark( - name_left_product_partitioned_transpose_f.c_str(), - ceres::internal::PMVLeftMultiplyAndAccumulateWithTransposeF, - data, - &context) ->Arg(1) ->Arg(2) ->Arg(4) @@ -594,15 +529,6 @@ ceres::internal::PMVLeftMultiplyAndAccumulateE, data, &context) - ->Arg(1); - - const std::string name_left_product_partitioned_transpose_e = - 
"PMVLeftMultiplyAndAccumulateWithTransposeE<" + path + ">"; - ::benchmark::RegisterBenchmark( - name_left_product_partitioned_transpose_e.c_str(), - ceres::internal::PMVLeftMultiplyAndAccumulateWithTransposeE, - data, - &context) ->Arg(1) ->Arg(2) ->Arg(4)
diff --git a/internal/ceres/fake_bundle_adjustment_jacobian.cc b/internal/ceres/fake_bundle_adjustment_jacobian.cc
index af1d03c..d8f3c01 100644
--- a/internal/ceres/fake_bundle_adjustment_jacobian.cc
+++ b/internal/ceres/fake_bundle_adjustment_jacobian.cc
@@ -109,8 +109,10 @@
       PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>;
   auto block_sparse_matrix = CreateFakeBundleAdjustmentJacobian(
       num_cameras, num_points, camera_size, landmark_size, visibility, rng);
+  LinearSolver::Options options;
+  options.elimination_groups.push_back(num_points);
   auto partitioned_view =
-      std::make_unique<PartitionedView>(*block_sparse_matrix, num_points);
+      std::make_unique<PartitionedView>(options, *block_sparse_matrix);
   return std::make_pair(std::move(partitioned_view),
                         std::move(block_sparse_matrix));
 }
diff --git a/internal/ceres/implicit_schur_complement.cc b/internal/ceres/implicit_schur_complement.cc
index 6249c8c..4e36048 100644
--- a/internal/ceres/implicit_schur_complement.cc
+++ b/internal/ceres/implicit_schur_complement.cc
@@ -105,15 +105,11 @@ double* y) const { // y1 = F x tmp_rows_.setZero(); - A_->RightMultiplyAndAccumulateF( - x, tmp_rows_.data(), options_.context, options_.num_threads); + A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data()); // y2 = E' y1 tmp_e_cols_.setZero(); - A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), - tmp_e_cols_.data(), - options_.context, - options_.num_threads); + A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data()); // y3 = -(E'E)^-1 y2 tmp_e_cols_2_.setZero(); @@ -121,13 +117,11 @@ tmp_e_cols_2_.data(), options_.context, options_.num_threads); + tmp_e_cols_2_ *= -1.0; // y1 = y1 + E y3 - A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), - tmp_rows_.data(), - options_.context, - options_.num_threads); + A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data()); // y5 = D * x if (D_ != nullptr) { @@ -140,8 +134,7 @@ } // y = y5 + F' y1 - A_->LeftMultiplyAndAccumulateF( - tmp_rows_.data(), y, options_.context, options_.num_threads); + A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), y); } void ImplicitSchurComplement::InversePowerSeriesOperatorRightMultiplyAccumulate( @@ -149,15 +142,11 @@ CHECK(compute_ftf_inverse_); // y1 = F x tmp_rows_.setZero(); - A_->RightMultiplyAndAccumulateF( - x, tmp_rows_.data(), options_.context, options_.num_threads); + A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data()); // y2 = E' y1 tmp_e_cols_.setZero(); - A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), - tmp_e_cols_.data(), - options_.context, - options_.num_threads); + A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data()); // y3 = (E'E)^-1 y2 tmp_e_cols_2_.setZero(); @@ -167,17 +156,11 @@ options_.num_threads); // y1 = E y3 tmp_rows_.setZero(); - A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), - tmp_rows_.data(), - options_.context, - options_.num_threads); + A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data()); // y4 = F' y1 tmp_f_cols_.setZero(); - A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), - tmp_f_cols_.data(), - options_.context, - options_.num_threads); + A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), tmp_f_cols_.data()); // y += (F'F)^-1 y4 block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate( @@ -219,18 +202,14 @@ // y1 = F x tmp_rows_.setZero(); - A_->RightMultiplyAndAccumulateF( - x, tmp_rows_.data(), options_.context, options_.num_threads); + A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data()); // y2 = b - y1 tmp_rows_ = ConstVectorRef(b_, num_rows) - tmp_rows_; // y3 = E' y2 tmp_e_cols_.setZero(); - A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), - tmp_e_cols_.data(), - options_.context, - options_.num_threads); + A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data()); // y = (E'E)^-1 y3 VectorRef(y, num_cols).setZero(); @@ -254,8 +233,7 @@ void ImplicitSchurComplement::UpdateRhs() { // y1 = E'b tmp_e_cols_.setZero(); - A_->LeftMultiplyAndAccumulateE( - b_, tmp_e_cols_.data(), options_.context, options_.num_threads); + A_->LeftMultiplyAndAccumulateE(b_, tmp_e_cols_.data()); // y2 = (E'E)^-1 y1 Vector y2 = Vector::Zero(A_->num_cols_e()); @@ -264,16 +242,14 @@ // y3 = E y2 tmp_rows_.setZero(); - A_->RightMultiplyAndAccumulateE( - y2.data(), tmp_rows_.data(), options_.context, options_.num_threads); + A_->RightMultiplyAndAccumulateE(y2.data(), tmp_rows_.data()); // y3 = b - y3 tmp_rows_ = ConstVectorRef(b_, A_->num_rows()) - tmp_rows_; // rhs = F' y3 rhs_.setZero(); - A_->LeftMultiplyAndAccumulateF( - tmp_rows_.data(), rhs_.data(), options_.context, options_.num_threads); + 
A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), rhs_.data()); } } // namespace ceres::internal
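With the context and thread-count arguments gone, the multiply sequence above reads directly as the implicit Schur-complement product it implements. In the notation of the code comments (E and F are the two column partitions of the Jacobian, D the optional diagonal regularizer; this is a restatement of the commented steps, not new code):

\[
\begin{aligned}
y_1 &= F x, \qquad y_2 = E^\top y_1, \qquad y_3 = -(E^\top E)^{-1} y_2, \\
y &= D^\top D x + F^\top (y_1 + E y_3)
   = \bigl(F^\top F - F^\top E\,(E^\top E)^{-1} E^\top F + D^\top D\bigr)\, x,
\end{aligned}
\]

that is, the Schur complement of E'E is applied to x without ever being assembled.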
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index 107d19c..fe5d242 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -299,6 +299,42 @@
   });
 }
 
+// Execute the function for every element in the range [start, end) with at
+// most num_threads, using the user-provided partitioning of the range, which
+// takes into account the integer cumulative costs of the iterations.
+template <typename F>
+void ParallelFor(ContextImpl* context,
+                 int start,
+                 int end,
+                 int num_threads,
+                 const F& function,
+                 const std::vector<int>& partitions) {
+  using namespace parallel_for_details;
+  CHECK_GT(num_threads, 0);
+  if (start >= end) {
+    return;
+  }
+  CHECK_EQ(partitions.front(), start);
+  CHECK_EQ(partitions.back(), end);
+  if (num_threads == 1 || end - start <= num_threads) {
+    ParallelFor(context, start, end, num_threads, function);
+    return;
+  }
+  CHECK_GT(partitions.size(), 1);
+  const int num_partitions = partitions.size() - 1;
+  ParallelFor(context,
+              0,
+              num_partitions,
+              num_threads,
+              [&function, &partitions](int thread_id, int partition_id) {
+                const int partition_start = partitions[partition_id];
+                const int partition_end = partitions[partition_id + 1];
+
+                for (int i = partition_start; i < partition_end; ++i) {
+                  Invoke<F>(thread_id, i, function);
+                }
+              });
+}
 }  // namespace ceres::internal
 
 // Backend-specific implementations of ParallelInvoke
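The overload falls back to the plain index-based ParallelFor when the work is single-threaded or the range is tiny; otherwise every adjacent pair in partitions becomes one task, so a well-chosen partitioning equalizes the total cost per task rather than the number of iterations. A hypothetical usage sketch, assuming it runs inside ceres::internal with a valid ContextImpl* context (the split {0, 1, 3, 6} is made up):

// Split [0, 6) into three chunks; each half-open chunk
// [partitions[i], partitions[i + 1]) runs as a single task.
std::vector<double> row(6, 0.0);
const std::vector<int> partitions = {0, 1, 3, 6};
ParallelFor(
    context,
    /*start=*/0,
    /*end=*/6,
    /*num_threads=*/4,
    [&row](int i) { row[i] += 1.0; },  // Invoke<> also accepts (thread_id, i)
    partitions);

Note that with end - start = 6 and num_threads = 4 this example does take the partitioned path, since end - start > num_threads.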
diff --git a/internal/ceres/partitioned_matrix_view.cc b/internal/ceres/partitioned_matrix_view.cc
index 2710366..d952a2a 100644
--- a/internal/ceres/partitioned_matrix_view.cc
+++ b/internal/ceres/partitioned_matrix_view.cc
@@ -55,121 +55,121 @@ (options.e_block_size == 2) && (options.f_block_size == 2)) { return std::make_unique<PartitionedMatrixView<2,2, 2>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 2) && (options.f_block_size == 3)) { return std::make_unique<PartitionedMatrixView<2,2, 3>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 2) && (options.f_block_size == 4)) { return std::make_unique<PartitionedMatrixView<2,2, 4>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 2)) { return std::make_unique<PartitionedMatrixView<2,2, Eigen::Dynamic>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 3) && (options.f_block_size == 3)) { return std::make_unique<PartitionedMatrixView<2,3, 3>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 3) && (options.f_block_size == 4)) { return std::make_unique<PartitionedMatrixView<2,3, 4>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 3) && (options.f_block_size == 6)) { return std::make_unique<PartitionedMatrixView<2,3, 6>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 3) && (options.f_block_size == 9)) { return std::make_unique<PartitionedMatrixView<2,3, 9>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 3)) { return std::make_unique<PartitionedMatrixView<2,3, Eigen::Dynamic>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 4) && (options.f_block_size == 3)) { return std::make_unique<PartitionedMatrixView<2,4, 3>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 4) && (options.f_block_size == 4)) { return std::make_unique<PartitionedMatrixView<2,4, 4>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 4) && (options.f_block_size == 6)) { return std::make_unique<PartitionedMatrixView<2,4, 6>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 4) && (options.f_block_size == 8)) { return std::make_unique<PartitionedMatrixView<2,4, 8>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 4) && (options.f_block_size == 9)) { return std::make_unique<PartitionedMatrixView<2,4, 9>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 2) && (options.e_block_size == 4)) { return std::make_unique<PartitionedMatrixView<2,4, Eigen::Dynamic>>( - matrix, options.elimination_groups[0]); + options, matrix); } if (options.row_block_size == 2) { return std::make_unique<PartitionedMatrixView<2,Eigen::Dynamic, Eigen::Dynamic>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 3) && (options.e_block_size == 3) && (options.f_block_size == 3)) { return std::make_unique<PartitionedMatrixView<3,3, 3>>( - 
matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 4) && (options.e_block_size == 4) && (options.f_block_size == 2)) { return std::make_unique<PartitionedMatrixView<4,4, 2>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 4) && (options.e_block_size == 4) && (options.f_block_size == 3)) { return std::make_unique<PartitionedMatrixView<4,4, 3>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 4) && (options.e_block_size == 4) && (options.f_block_size == 4)) { return std::make_unique<PartitionedMatrixView<4,4, 4>>( - matrix, options.elimination_groups[0]); + options, matrix); } if ((options.row_block_size == 4) && (options.e_block_size == 4)) { return std::make_unique<PartitionedMatrixView<4,4, Eigen::Dynamic>>( - matrix, options.elimination_groups[0]); + options, matrix); } #endif @@ -179,7 +179,7 @@ return std::make_unique<PartitionedMatrixView<Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic>>( - matrix, options.elimination_groups[0]); + options, matrix); }; } // namespace ceres::internal
diff --git a/internal/ceres/partitioned_matrix_view.h b/internal/ceres/partitioned_matrix_view.h
index 533a0e5..f34fd4a 100644
--- a/internal/ceres/partitioned_matrix_view.h
+++ b/internal/ceres/partitioned_matrix_view.h
@@ -70,33 +70,25 @@ // y += E'x virtual void LeftMultiplyAndAccumulateE(const double* x, double* y) const = 0; - virtual void LeftMultiplyAndAccumulateE(const double* x, - double* y, - ContextImpl* context, - int num_threads) const = 0; + virtual void LeftMultiplyAndAccumulateESingleThreaded(const double* x, + double* y) const = 0; + virtual void LeftMultiplyAndAccumulateEMultiThreaded(const double* x, + double* y) const = 0; // y += F'x virtual void LeftMultiplyAndAccumulateF(const double* x, double* y) const = 0; - virtual void LeftMultiplyAndAccumulateF(const double* x, - double* y, - ContextImpl* context, - int num_threads) const = 0; + virtual void LeftMultiplyAndAccumulateFSingleThreaded(const double* x, + double* y) const = 0; + virtual void LeftMultiplyAndAccumulateFMultiThreaded(const double* x, + double* y) const = 0; // y += Ex virtual void RightMultiplyAndAccumulateE(const double* x, double* y) const = 0; - virtual void RightMultiplyAndAccumulateE(const double* x, - double* y, - ContextImpl* context, - int num_threads) const = 0; // y += Fx virtual void RightMultiplyAndAccumulateF(const double* x, double* y) const = 0; - virtual void RightMultiplyAndAccumulateF(const double* x, - double* y, - ContextImpl* context, - int num_threads) const = 0; // Create and return the block diagonal of the matrix E'E. virtual std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const = 0; @@ -128,6 +120,8 @@ virtual int num_cols_f() const = 0; virtual int num_rows() const = 0; virtual int num_cols() const = 0; + virtual const std::vector<int>& e_cols_partition() const = 0; + virtual const std::vector<int>& f_cols_partition() const = 0; // clang-format on static std::unique_ptr<PartitionedMatrixViewBase> Create( @@ -141,29 +135,34 @@ : public PartitionedMatrixViewBase { public: // matrix = [E F], where the matrix E contains the first - // num_col_blocks_a column blocks. - PartitionedMatrixView(const BlockSparseMatrix& matrix, int num_col_blocks_e); + // options.elimination_groups[0] column blocks. 
+ PartitionedMatrixView(const LinearSolver::Options& options, + const BlockSparseMatrix& matrix); - void LeftMultiplyAndAccumulateE(const double* x, double* y) const final; - void LeftMultiplyAndAccumulateF(const double* x, double* y) const final; - void LeftMultiplyAndAccumulateE(const double* x, - double* y, - ContextImpl* context, - int num_threads) const final; - void LeftMultiplyAndAccumulateF(const double* x, - double* y, - ContextImpl* context, - int num_threads) const final; - void RightMultiplyAndAccumulateE(const double* x, double* y) const final; - void RightMultiplyAndAccumulateF(const double* x, double* y) const final; - void RightMultiplyAndAccumulateE(const double* x, - double* y, - ContextImpl* context, - int num_threads) const final; - void RightMultiplyAndAccumulateF(const double* x, - double* y, - ContextImpl* context, - int num_threads) const final; + // y += E'x + virtual void LeftMultiplyAndAccumulateE(const double* x, + double* y) const final; + virtual void LeftMultiplyAndAccumulateESingleThreaded(const double* x, + double* y) const final; + virtual void LeftMultiplyAndAccumulateEMultiThreaded(const double* x, + double* y) const final; + + // y += F'x + virtual void LeftMultiplyAndAccumulateF(const double* x, + double* y) const final; + virtual void LeftMultiplyAndAccumulateFSingleThreaded(const double* x, + double* y) const final; + virtual void LeftMultiplyAndAccumulateFMultiThreaded(const double* x, + double* y) const final; + + // y += Ex + virtual void RightMultiplyAndAccumulateE(const double* x, + double* y) const final; + + // y += Fx + virtual void RightMultiplyAndAccumulateF(const double* x, + double* y) const final; + std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const final; std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalFtF() const final; void UpdateBlockDiagonalEtE(BlockSparseMatrix* block_diagonal) const final; @@ -176,17 +175,26 @@ int num_rows() const final { return matrix_.num_rows(); } int num_cols() const final { return matrix_.num_cols(); } // clang-format on + const std::vector<int>& e_cols_partition() const final { + return e_cols_partition_; + } + const std::vector<int>& f_cols_partition() const final { + return f_cols_partition_; + } private: std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalMatrixLayout( int start_col_block, int end_col_block) const; + const LinearSolver::Options options_; const BlockSparseMatrix& matrix_; int num_row_blocks_e_; int num_col_blocks_e_; int num_col_blocks_f_; int num_cols_e_; int num_cols_f_; + std::vector<int> e_cols_partition_; + std::vector<int> f_cols_partition_; }; } // namespace ceres::internal
diff --git a/internal/ceres/partitioned_matrix_view_benchmark.cc b/internal/ceres/partitioned_matrix_view_benchmark.cc
deleted file mode 100644
index 63d0e34..0000000
--- a/internal/ceres/partitioned_matrix_view_benchmark.cc
+++ /dev/null
@@ -1,147 +0,0 @@ -// Ceres Solver - A fast non-linear least squares minimizer -// Copyright 2022 Google Inc. All rights reserved. -// http://ceres-solver.org/ -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// * Neither the name of Google Inc. nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. - -#include <memory> -#include <random> - -#include "benchmark/benchmark.h" -#include "ceres/context_impl.h" -#include "ceres/fake_bundle_adjustment_jacobian.h" -#include "ceres/partitioned_matrix_view.h" - -constexpr int kNumCameras = 1000; -constexpr int kNumPoints = 10000; -constexpr int kCameraSize = 6; -constexpr int kPointSize = 3; -constexpr double kVisibility = 0.1; - -namespace ceres::internal { - -static void BM_PatitionedViewRightMultiplyAndAccumulateE_Static( - benchmark::State& state) { - const int num_threads = state.range(0); - std::mt19937 prng; - auto [partitioned_view, jacobian] = - CreateFakeBundleAdjustmentPartitionedJacobian<kPointSize, kCameraSize>( - kNumCameras, kNumPoints, kVisibility, prng); - - ContextImpl context; - context.EnsureMinimumThreads(num_threads); - - Vector x(partitioned_view->num_cols_e()); - Vector y(partitioned_view->num_rows()); - x.setRandom(); - y.setRandom(); - double sum = 0; - for (auto _ : state) { - partitioned_view->RightMultiplyAndAccumulateE( - x.data(), y.data(), &context, num_threads); - sum += y.norm(); - } - CHECK_NE(sum, 0.0); -} -BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateE_Static) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - -static void BM_PatitionedViewRightMultiplyAndAccumulateE_Dynamic( - benchmark::State& state) { - std::mt19937 prng; - auto [partitioned_view, jacobian] = - CreateFakeBundleAdjustmentPartitionedJacobian( - kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); - - Vector x(partitioned_view->num_cols_e()); - Vector y(partitioned_view->num_rows()); - x.setRandom(); - y.setRandom(); - double sum = 0; - for (auto _ : state) { - partitioned_view->RightMultiplyAndAccumulateE(x.data(), y.data()); - sum += y.norm(); - } - CHECK_NE(sum, 0.0); -} -BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateE_Dynamic); - -static void 
BM_PatitionedViewRightMultiplyAndAccumulateF_Static( - benchmark::State& state) { - const int num_threads = state.range(0); - std::mt19937 prng; - auto [partitioned_view, jacobian] = - CreateFakeBundleAdjustmentPartitionedJacobian<kPointSize, kCameraSize>( - kNumCameras, kNumPoints, kVisibility, prng); - - ContextImpl context; - context.EnsureMinimumThreads(num_threads); - - Vector x(partitioned_view->num_cols_f()); - Vector y(partitioned_view->num_rows()); - x.setRandom(); - y.setRandom(); - double sum = 0; - for (auto _ : state) { - partitioned_view->RightMultiplyAndAccumulateF( - x.data(), y.data(), &context, num_threads); - sum += y.norm(); - } - CHECK_NE(sum, 0.0); -} -BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateF_Static) - ->Arg(1) - ->Arg(2) - ->Arg(4) - ->Arg(8) - ->Arg(16); - -static void BM_PatitionedViewRightMultiplyAndAccumulateF_Dynamic( - benchmark::State& state) { - std::mt19937 prng; - auto [partitioned_view, jacobian] = - CreateFakeBundleAdjustmentPartitionedJacobian( - kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); - - Vector x(partitioned_view->num_cols_f()); - Vector y(partitioned_view->num_rows()); - x.setRandom(); - y.setRandom(); - double sum = 0; - for (auto _ : state) { - partitioned_view->RightMultiplyAndAccumulateF(x.data(), y.data()); - sum += y.norm(); - } - CHECK_NE(sum, 0.0); -} -BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateF_Dynamic); - -} // namespace ceres::internal - -BENCHMARK_MAIN();
diff --git a/internal/ceres/partitioned_matrix_view_impl.h b/internal/ceres/partitioned_matrix_view_impl.h
index 689f09e..1ca2e14 100644
--- a/internal/ceres/partitioned_matrix_view_impl.h
+++ b/internal/ceres/partitioned_matrix_view_impl.h
@@ -45,11 +45,14 @@ template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: - PartitionedMatrixView(const BlockSparseMatrix& matrix, int num_col_blocks_e) - : matrix_(matrix), num_col_blocks_e_(num_col_blocks_e) { + PartitionedMatrixView(const LinearSolver::Options& options, + const BlockSparseMatrix& matrix) + + : options_(options), matrix_(matrix) { const CompressedRowBlockStructure* bs = matrix_.block_structure(); CHECK(bs != nullptr); + num_col_blocks_e_ = options_.elimination_groups[0]; num_col_blocks_f_ = bs->cols.size() - num_col_blocks_e_; // Compute the number of row blocks in E. The number of row blocks @@ -79,6 +82,25 @@ } CHECK_EQ(num_cols_e_ + num_cols_f_, matrix_.num_cols()); + + auto transpose_bs = matrix_.transpose_block_structure(); + const int num_threads = options_.num_threads; + if (transpose_bs != nullptr && num_threads > 1) { + int kMaxPartitions = num_threads * 4; + e_cols_partition_ = parallel_for_details::ComputePartition( + 0, + num_col_blocks_e_, + kMaxPartitions, + transpose_bs->rows.data(), + [](const CompressedRow& row) { return row.cumulative_nnz; }); + + f_cols_partition_ = parallel_for_details::ComputePartition( + num_col_blocks_e_, + num_col_blocks_e_ + num_col_blocks_f_, + kMaxPartitions, + transpose_bs->rows.data(), + [](const CompressedRow& row) { return row.cumulative_nnz; }); + } } // The next four methods don't seem to be particularly cache @@ -89,23 +111,14 @@ template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: RightMultiplyAndAccumulateE(const double* x, double* y) const { - RightMultiplyAndAccumulateE(x, y, nullptr, 1); -} - -template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> -void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: - RightMultiplyAndAccumulateE(const double* x, - double* y, - ContextImpl* context, - int num_threads) const { // Iterate over the first num_row_blocks_e_ row blocks, and multiply // by the first cell in each row block. auto bs = matrix_.block_structure(); const double* values = matrix_.values(); - ParallelFor(context, + ParallelFor(options_.context, 0, num_row_blocks_e_, - num_threads, + options_.num_threads, [values, bs, x, y](int row_block_id) { const Cell& cell = bs->rows[row_block_id].cells[0]; const int row_block_pos = bs->rows[row_block_id].block.position; @@ -125,15 +138,6 @@ template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: RightMultiplyAndAccumulateF(const double* x, double* y) const { - RightMultiplyAndAccumulateF(x, y, nullptr, 1); -} - -template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> -void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: - RightMultiplyAndAccumulateF(const double* x, - double* y, - ContextImpl* context, - int num_threads) const { // Iterate over row blocks, and if the row block is in E, then // multiply by all the cells except the first one which is of type // E. 
If the row block is not in E (i.e its in the bottom @@ -143,10 +147,10 @@ const int num_row_blocks = bs->rows.size(); const int num_cols_e = num_cols_e_; const double* values = matrix_.values(); - ParallelFor(context, + ParallelFor(options_.context, 0, num_row_blocks_e_, - num_threads, + options_.num_threads, [values, bs, num_cols_e, x, y](int row_block_id) { const int row_block_pos = bs->rows[row_block_id].block.position; const int row_block_size = bs->rows[row_block_id].block.size; @@ -163,10 +167,10 @@ // clang-format on } }); - ParallelFor(context, + ParallelFor(options_.context, num_row_blocks_e_, num_row_blocks, - num_threads, + options_.num_threads, [values, bs, num_cols_e, x, y](int row_block_id) { const int row_block_pos = bs->rows[row_block_id].block.position; const int row_block_size = bs->rows[row_block_id].block.size; @@ -190,7 +194,17 @@ LeftMultiplyAndAccumulateE(const double* x, double* y) const { if (!num_col_blocks_e_) return; if (!num_row_blocks_e_) return; + if (options_.num_threads == 1) { + LeftMultiplyAndAccumulateESingleThreaded(x, y); + } else { + CHECK(options_.context != nullptr); + LeftMultiplyAndAccumulateEMultiThreaded(x, y); + } +} +template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> +void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: + LeftMultiplyAndAccumulateESingleThreaded(const double* x, double* y) const { const CompressedRowBlockStructure* bs = matrix_.block_structure(); // Iterate over the first num_row_blocks_e_ row blocks, and multiply @@ -214,28 +228,19 @@ template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: - LeftMultiplyAndAccumulateE(const double* x, - double* y, - ContextImpl* context, - int num_threads) const { - if (!num_col_blocks_e_) return; - if (!num_row_blocks_e_) return; - + LeftMultiplyAndAccumulateEMultiThreaded(const double* x, double* y) const { auto transpose_bs = matrix_.transpose_block_structure(); - if (transpose_bs == nullptr || num_threads == 1) { - LeftMultiplyAndAccumulateE(x, y); - return; - } + CHECK(transpose_bs != nullptr); // Local copies of class members in order to avoid capturing pointer to the // whole object in lambda function auto values = matrix_.values(); const int num_row_blocks_e = num_row_blocks_e_; ParallelFor( - context, + options_.context, 0, num_col_blocks_e_, - num_threads, + options_.num_threads, [values, transpose_bs, num_row_blocks_e, x, y](int row_block_id) { int row_block_pos = transpose_bs->rows[row_block_id].block.position; int row_block_size = transpose_bs->rows[row_block_id].block.size; @@ -254,14 +259,24 @@ y + row_block_pos); } }, - transpose_bs->rows.data(), - [](const CompressedRow& row) { return row.cumulative_nnz; }); + e_cols_partition()); } template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: LeftMultiplyAndAccumulateF(const double* x, double* y) const { if (!num_col_blocks_f_) return; + if (options_.num_threads == 1) { + LeftMultiplyAndAccumulateFSingleThreaded(x, y); + } else { + CHECK(options_.context != nullptr); + LeftMultiplyAndAccumulateFMultiThreaded(x, y); + } +} + +template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> +void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: + LeftMultiplyAndAccumulateFSingleThreaded(const double* x, double* y) const { const CompressedRowBlockStructure* bs = matrix_.block_structure(); // Iterate over row blocks, and if the row block is 
in E, then @@ -307,26 +322,19 @@ template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: - LeftMultiplyAndAccumulateF(const double* x, - double* y, - ContextImpl* context, - int num_threads) const { - if (!num_col_blocks_f_) return; + LeftMultiplyAndAccumulateFMultiThreaded(const double* x, double* y) const { auto transpose_bs = matrix_.transpose_block_structure(); - if (transpose_bs == nullptr || num_threads == 1) { - LeftMultiplyAndAccumulateF(x, y); - return; - } + CHECK(transpose_bs != nullptr); // Local copies of class members in order to avoid capturing pointer to the // whole object in lambda function auto values = matrix_.values(); const int num_row_blocks_e = num_row_blocks_e_; const int num_cols_e = num_cols_e_; ParallelFor( - context, + options_.context, num_col_blocks_e_, num_col_blocks_e_ + num_col_blocks_f_, - num_threads, + options_.num_threads, [values, transpose_bs, num_row_blocks_e, num_cols_e, x, y]( int row_block_id) { int row_block_pos = transpose_bs->rows[row_block_id].block.position; @@ -362,8 +370,7 @@ y + row_block_pos - num_cols_e); } }, - transpose_bs->rows.data(), - [](const CompressedRow& row) { return row.cumulative_nnz; }); + f_cols_partition()); } // Given a range of columns blocks of a matrix m, compute the block
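The constructor now builds e_cols_partition_ and f_cols_partition_ once, via parallel_for_details::ComputePartition over the transpose block structure's cumulative_nnz counts. ComputePartition itself is not part of this patch; the standalone sketch below only illustrates the kind of cumulative-cost splitting involved (the name PartitionByCumulativeCost and the binary-search strategy are assumptions, not the actual implementation):

#include <vector>

// Split [start, end) into at most max_partitions contiguous chunks of roughly
// equal cumulative cost. cost(i) must return the cumulative cost of blocks
// 0..i and be non-decreasing; requires start < end.
template <typename CumulativeCost>
std::vector<int> PartitionByCumulativeCost(int start,
                                           int end,
                                           int max_partitions,
                                           const CumulativeCost& cost) {
  std::vector<int> partitions = {start};
  const long long cost_before = (start > 0) ? cost(start - 1) : 0;
  const long long total = cost(end - 1) - cost_before;
  for (int p = 1; p < max_partitions; ++p) {
    // Ideal cumulative cost at which the p-th chunk should end.
    const long long target = cost_before + total * p / max_partitions;
    int lo = partitions.back();
    int hi = end;
    // Find the first block index whose cumulative cost exceeds the target.
    while (lo < hi) {
      const int mid = lo + (hi - lo) / 2;
      if (cost(mid) <= target) {
        lo = mid + 1;
      } else {
        hi = mid;
      }
    }
    if (lo > partitions.back() && lo < end) {
      partitions.push_back(lo);
    }
  }
  partitions.push_back(end);  // invariant: front() == start, back() == end
  return partitions;
}

Balancing on cumulative nonzeros rather than on block count keeps the per-task work roughly equal even when column blocks differ widely in fill, which is exactly the situation for the point (E) and camera (F) blocks of a bundle-adjustment Jacobian.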
diff --git a/internal/ceres/partitioned_matrix_view_template.py b/internal/ceres/partitioned_matrix_view_template.py
index 27f368f..8978d63 100644
--- a/internal/ceres/partitioned_matrix_view_template.py
+++ b/internal/ceres/partitioned_matrix_view_template.py
@@ -132,7 +132,7 @@ #ifndef CERES_RESTRICT_SCHUR_SPECIALIZATION
 """
 
 FACTORY = """
   return std::make_unique<PartitionedMatrixView<%s,%s, %s>>(
-      matrix, options.elimination_groups[0]);"""
+      options, matrix);"""
 
 FACTORY_FOOTER = """
 #endif
@@ -142,7 +142,7 @@
   return std::make_unique<PartitionedMatrixView<Eigen::Dynamic,
                                                 Eigen::Dynamic,
                                                 Eigen::Dynamic>>(
-      matrix, options.elimination_groups[0]);
+      options, matrix);
 };
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/partitioned_matrix_view_test.cc b/internal/ceres/partitioned_matrix_view_test.cc
index 4653e06..fc1ebdd 100644
--- a/internal/ceres/partitioned_matrix_view_test.cc
+++ b/internal/ceres/partitioned_matrix_view_test.cc
@@ -76,76 +76,6 @@ std::uniform_real_distribution<double>(0.0, 1.0); }; -TEST_F(PartitionedMatrixViewTest, DimensionsTest) { - EXPECT_EQ(pmv_->num_col_blocks_e(), num_eliminate_blocks_); - EXPECT_EQ(pmv_->num_col_blocks_f(), num_cols_ - num_eliminate_blocks_); - EXPECT_EQ(pmv_->num_cols_e(), num_eliminate_blocks_); - EXPECT_EQ(pmv_->num_cols_f(), num_cols_ - num_eliminate_blocks_); - EXPECT_EQ(pmv_->num_cols(), A_->num_cols()); - EXPECT_EQ(pmv_->num_rows(), A_->num_rows()); -} - -TEST_F(PartitionedMatrixViewTest, RightMultiplyAndAccumulateE) { - Vector x1(pmv_->num_cols_e()); - Vector x2(pmv_->num_cols()); - x2.setZero(); - - for (int i = 0; i < pmv_->num_cols_e(); ++i) { - x1(i) = x2(i) = RandDouble(); - } - - Vector y1 = Vector::Zero(pmv_->num_rows()); - pmv_->RightMultiplyAndAccumulateE(x1.data(), y1.data()); - - Vector y2 = Vector::Zero(pmv_->num_rows()); - A_->RightMultiplyAndAccumulate(x2.data(), y2.data()); - - for (int i = 0; i < pmv_->num_rows(); ++i) { - EXPECT_NEAR(y1(i), y2(i), kEpsilon); - } -} - -TEST_F(PartitionedMatrixViewTest, RightMultiplyAndAccumulateF) { - Vector x1(pmv_->num_cols_f()); - Vector x2 = Vector::Zero(pmv_->num_cols()); - - for (int i = 0; i < pmv_->num_cols_f(); ++i) { - x1(i) = RandDouble(); - x2(i + pmv_->num_cols_e()) = x1(i); - } - - Vector y1 = Vector::Zero(pmv_->num_rows()); - pmv_->RightMultiplyAndAccumulateF(x1.data(), y1.data()); - - Vector y2 = Vector::Zero(pmv_->num_rows()); - A_->RightMultiplyAndAccumulate(x2.data(), y2.data()); - - for (int i = 0; i < pmv_->num_rows(); ++i) { - EXPECT_NEAR(y1(i), y2(i), kEpsilon); - } -} - -TEST_F(PartitionedMatrixViewTest, LeftMultiplyAndAccumulate) { - Vector x = Vector::Zero(pmv_->num_rows()); - for (int i = 0; i < pmv_->num_rows(); ++i) { - x(i) = RandDouble(); - } - - Vector y = Vector::Zero(pmv_->num_cols()); - Vector y1 = Vector::Zero(pmv_->num_cols_e()); - Vector y2 = Vector::Zero(pmv_->num_cols_f()); - - A_->LeftMultiplyAndAccumulate(x.data(), y.data()); - pmv_->LeftMultiplyAndAccumulateE(x.data(), y1.data()); - pmv_->LeftMultiplyAndAccumulateF(x.data(), y2.data()); - - for (int i = 0; i < pmv_->num_cols(); ++i) { - EXPECT_NEAR(y(i), - (i < pmv_->num_cols_e()) ? 
y1(i) : y2(i - pmv_->num_cols_e()), - kEpsilon); - } -} - TEST_F(PartitionedMatrixViewTest, BlockDiagonalEtE) { std::unique_ptr<BlockSparseMatrix> block_diagonal_ee( pmv_->CreateBlockDiagonalEtE()); @@ -174,134 +104,122 @@ EXPECT_NEAR(block_diagonal_ff->values()[2], 37.0, kEpsilon); } -const int kMaxNumThreads = 8; -class PartitionedMatrixViewParallelTest : public ::testing::TestWithParam<int> { +// Param = <problem_id, num_threads> +using Param = ::testing::tuple<int, int>; + +static std::string ParamInfoToString(testing::TestParamInfo<Param> info) { + Param param = info.param; + std::stringstream ss; + ss << ::testing::get<0>(param) << "_" << ::testing::get<1>(param); + return ss.str(); +} + +class PartitionedMatrixViewSpMVTest : public ::testing::TestWithParam<Param> { protected: - static const int kNumProblems = 3; void SetUp() final { - int problem_ids[kNumProblems] = {2, 4, 6}; - for (int i = 0; i < kNumProblems; ++i) { - auto problem = CreateLinearLeastSquaresProblemFromId(problem_ids[i]); - CHECK(problem != nullptr); - SetUpMatrix(i, problem.get()); - } - - context_.EnsureMinimumThreads(kMaxNumThreads); - } - - void SetUpMatrix(int id, LinearLeastSquaresProblem* problem) { - A_[id] = std::move(problem->A); - auto& A = A_[id]; - auto block_sparse = down_cast<BlockSparseMatrix*>(A.get()); + const int problem_id = ::testing::get<0>(GetParam()); + const int num_threads = ::testing::get<1>(GetParam()); + auto problem = CreateLinearLeastSquaresProblemFromId(problem_id); + CHECK(problem != nullptr); + A_ = std::move(problem->A); + auto block_sparse = down_cast<BlockSparseMatrix*>(A_.get()); block_sparse->AddTransposeBlockStructure(); - num_cols_[id] = A->num_cols(); - num_rows_[id] = A->num_rows(); - num_eliminate_blocks_[id] = problem->num_eliminate_blocks; - LinearSolver::Options options; - options.elimination_groups.push_back(num_eliminate_blocks_[id]); - pmv_[id] = PartitionedMatrixViewBase::Create(options, *block_sparse); + options_.num_threads = num_threads; + options_.context = &context_; + options_.elimination_groups.push_back(problem->num_eliminate_blocks); + pmv_ = PartitionedMatrixViewBase::Create(options_, *block_sparse); + + EXPECT_EQ(pmv_->num_col_blocks_e(), problem->num_eliminate_blocks); + EXPECT_EQ(pmv_->num_col_blocks_f(), + block_sparse->block_structure()->cols.size() - + problem->num_eliminate_blocks); + EXPECT_EQ(pmv_->num_cols(), A_->num_cols()); + EXPECT_EQ(pmv_->num_rows(), A_->num_rows()); } double RandDouble() { return distribution_(prng_); } + LinearSolver::Options options_; ContextImpl context_; - int num_rows_[kNumProblems]; - int num_cols_[kNumProblems]; - int num_eliminate_blocks_[kNumProblems]; - std::unique_ptr<SparseMatrix> A_[kNumProblems]; - std::unique_ptr<PartitionedMatrixViewBase> pmv_[kNumProblems]; + std::unique_ptr<LinearLeastSquaresProblem> problem_; + std::unique_ptr<SparseMatrix> A_; + std::unique_ptr<PartitionedMatrixViewBase> pmv_; + int num_cols_e; std::mt19937 prng_; std::uniform_real_distribution<double> distribution_ = std::uniform_real_distribution<double>(0.0, 1.0); }; -TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateEParallel) { - const int kNumThreads = GetParam(); - for (int p = 0; p < kNumProblems; ++p) { - auto& pmv = pmv_[p]; - auto& A = A_[p]; +TEST_P(PartitionedMatrixViewSpMVTest, RightMultiplyAndAccumulateE) { + Vector x1(pmv_->num_cols_e()); + Vector x2(pmv_->num_cols()); + x2.setZero(); - Vector x1(pmv->num_cols_e()); - Vector x2(pmv->num_cols()); - x2.setZero(); + for (int i = 0; i < pmv_->num_cols_e(); 
++i) { + x1(i) = x2(i) = RandDouble(); + } - for (int i = 0; i < pmv->num_cols_e(); ++i) { - x1(i) = x2(i) = RandDouble(); - } + Vector expected = Vector::Zero(pmv_->num_rows()); + A_->RightMultiplyAndAccumulate(x2.data(), expected.data()); - Vector y1 = Vector::Zero(pmv->num_rows()); - pmv->RightMultiplyAndAccumulateE( - x1.data(), y1.data(), &context_, kNumThreads); + Vector actual = Vector::Zero(pmv_->num_rows()); + pmv_->RightMultiplyAndAccumulateE(x1.data(), actual.data()); - Vector y2 = Vector::Zero(pmv->num_rows()); - A->RightMultiplyAndAccumulate(x2.data(), y2.data()); - - for (int i = 0; i < pmv->num_rows(); ++i) { - EXPECT_NEAR(y1(i), y2(i), kEpsilon); - } + for (int i = 0; i < pmv_->num_rows(); ++i) { + EXPECT_NEAR(actual(i), expected(i), kEpsilon); } } -TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateFParallel) { - const int kNumThreads = GetParam(); - for (int p = 0; p < kNumProblems; ++p) { - auto& pmv = pmv_[p]; - auto& A = A_[p]; - Vector x1(pmv->num_cols_f()); - Vector x2(pmv->num_cols()); - x2.setZero(); +TEST_P(PartitionedMatrixViewSpMVTest, RightMultiplyAndAccumulateF) { + Vector x1(pmv_->num_cols_f()); + Vector x2(pmv_->num_cols()); + x2.setZero(); - for (int i = 0; i < pmv->num_cols_f(); ++i) { - x1(i) = x2(i + pmv->num_cols_e()) = RandDouble(); - } + for (int i = 0; i < pmv_->num_cols_f(); ++i) { + x1(i) = x2(i + pmv_->num_cols_e()) = RandDouble(); + } - Vector y1 = Vector::Zero(pmv->num_rows()); - pmv->RightMultiplyAndAccumulateF( - x1.data(), y1.data(), &context_, kNumThreads); + Vector actual = Vector::Zero(pmv_->num_rows()); + pmv_->RightMultiplyAndAccumulateF(x1.data(), actual.data()); - Vector y2 = Vector::Zero(pmv->num_rows()); - A->RightMultiplyAndAccumulate(x2.data(), y2.data()); + Vector expected = Vector::Zero(pmv_->num_rows()); + A_->RightMultiplyAndAccumulate(x2.data(), expected.data()); - for (int i = 0; i < pmv->num_rows(); ++i) { - EXPECT_NEAR(y1(i), y2(i), kEpsilon); - } + for (int i = 0; i < pmv_->num_rows(); ++i) { + EXPECT_NEAR(actual(i), expected(i), kEpsilon); } } -TEST_P(PartitionedMatrixViewParallelTest, LeftMultiplyAndAccumulateParallel) { - const int kNumThreads = GetParam(); - for (int p = 0; p < kNumProblems; ++p) { - auto& pmv = pmv_[p]; - auto& A = A_[p]; - Vector x = Vector::Zero(pmv->num_rows()); - for (int i = 0; i < pmv->num_rows(); ++i) { - x(i) = RandDouble(); - } - Vector x_pre = x; +TEST_P(PartitionedMatrixViewSpMVTest, LeftMultiplyAndAccumulate) { + Vector x = Vector::Zero(pmv_->num_rows()); + for (int i = 0; i < pmv_->num_rows(); ++i) { + x(i) = RandDouble(); + } + Vector x_pre = x; - Vector y = Vector::Zero(pmv->num_cols()); - Vector y1 = Vector::Zero(pmv->num_cols_e()); - Vector y2 = Vector::Zero(pmv->num_cols_f()); + Vector expected = Vector::Zero(pmv_->num_cols()); + Vector e_actual = Vector::Zero(pmv_->num_cols_e()); + Vector f_actual = Vector::Zero(pmv_->num_cols_f()); - A->LeftMultiplyAndAccumulate(x.data(), y.data()); - pmv->LeftMultiplyAndAccumulateE( - x.data(), y1.data(), &context_, kNumThreads); - pmv->LeftMultiplyAndAccumulateF( - x.data(), y2.data(), &context_, kNumThreads); + A_->LeftMultiplyAndAccumulate(x.data(), expected.data()); + pmv_->LeftMultiplyAndAccumulateE(x.data(), e_actual.data()); + pmv_->LeftMultiplyAndAccumulateF(x.data(), f_actual.data()); - for (int i = 0; i < pmv->num_cols(); ++i) { - EXPECT_NEAR(y(i), - (i < pmv->num_cols_e()) ? 
y1(i) : y2(i - pmv->num_cols_e()), - kEpsilon); - } + for (int i = 0; i < pmv_->num_cols(); ++i) { + EXPECT_NEAR(expected(i), + (i < pmv_->num_cols_e()) ? e_actual(i) + : f_actual(i - pmv_->num_cols_e()), + kEpsilon); } } -INSTANTIATE_TEST_SUITE_P(ParallelProducts, - PartitionedMatrixViewParallelTest, - ::testing::Values(1, 2, 4, 8), - ::testing::PrintToStringParamName()); +INSTANTIATE_TEST_SUITE_P( + ParallelProducts, + PartitionedMatrixViewSpMVTest, + ::testing::Combine(::testing::Values(2, 4, 6), + ::testing::Values(1, 2, 3, 4, 5, 6, 7, 8)), + ParamInfoToString); } // namespace internal } // namespace ceres