Parallel right products for partitioned view

Parallel implementations for right-multiply by dense vector for:
 - Partitioned matrix view
 - Block-sparse matrix
 - CRS matrix (non-symmetric only)

When coupled with non-interleaving indexes in parallel for, this
simple aproach provides a reasonable speedup.
For example, in CRS case difference with GPGPU approach reduces
closer to memory throughput ratio for high enough core count.

./bin/spmv_benchmark
-------------------------------------------------------------------
Benchmark                                                      Time
-------------------------------------------------------------------
BM_BlockSparseRightMultiplyAndAccumulateBA/1              28.5   ms
BM_BlockSparseRightMultiplyAndAccumulateBA/2              15.7   ms
BM_BlockSparseRightMultiplyAndAccumulateBA/4               9.01  ms
BM_BlockSparseRightMultiplyAndAccumulateBA/8               5.60  ms
BM_BlockSparseRightMultiplyAndAccumulateBA/16              3.86  ms
BM_BlockSparseRightMultiplyAndAccumulateBA/28              3.84  ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/1    23.8   ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/2    15.0   ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/4     8.01  ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/8     4.02  ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/16    2.39  ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/28    1.68  ms
BM_BlockSparseLeftMultiplyAndAccumulateBA                 30.7   ms
BM_BlockSparseLeftMultiplyAndAccumulateUnstructured       41.5   ms
BM_CRSRightMultiplyAndAccumulateBA/1                      24.1   ms
BM_CRSRightMultiplyAndAccumulateBA/2                      13.6   ms
BM_CRSRightMultiplyAndAccumulateBA/4                       8.70  ms
BM_CRSRightMultiplyAndAccumulateBA/8                       5.34  ms
BM_CRSRightMultiplyAndAccumulateBA/16                      3.99  ms
BM_CRSRightMultiplyAndAccumulateBA/28                      4.00  ms
BM_CRSRightMultiplyAndAccumulateUnstructured/1            21.1   ms
BM_CRSRightMultiplyAndAccumulateUnstructured/2            10.83  ms
BM_CRSRightMultiplyAndAccumulateUnstructured/4             5.88  ms
BM_CRSRightMultiplyAndAccumulateUnstructured/8             3.68  ms
BM_CRSRightMultiplyAndAccumulateUnstructured/16            2.21  ms
BM_CRSRightMultiplyAndAccumulateUnstructured/28            1.71  ms
BM_CRSLeftMultiplyAndAccumulateBA                         23.6   ms
BM_CRSLeftMultiplyAndAccumulateUnstructured               22.5   ms
BM_CudaRightMultiplyAndAccumulateBA                        0.679 ms
BM_CudaRightMultiplyAndAccumulateUnstructured              0.480 ms
BM_CudaLeftMultiplyAndAccumulateBA                         0.774 ms
BM_CudaLeftMultiplyAndAccumulateUnstructured               0.361 ms

./bin/partitioned_matrix_view_benchmark
-----------------------------------------------------------------
Benchmark                                                    Time
-----------------------------------------------------------------
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/1    18.5  ms
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/2    10.7  ms
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/4     6.34 ms
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/8     4.26 ms
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/16    3.86 ms
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/28    3.75 ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/1    18.8  ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/2    11.9  ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/4     6.94 ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/8     4.41 ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/16    3.63 ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/28    3.86 ms

Timings correspond to intel 8176 cpu and 2080ti nvidia gpu,
with OpenMP threading backend.

Change-Id: Idc07d0563103d057ca3c8412de81a7823fe232af
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index fef1c2d..350b3b5 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -598,6 +598,9 @@
   add_executable(spmv_benchmark spmv_benchmark.cc)
   add_dependencies_to_benchmark(spmv_benchmark)
 
+  add_executable(partitioned_matrix_view_benchmark partitioned_matrix_view_benchmark.cc)
+  add_dependencies_to_benchmark(partitioned_matrix_view_benchmark)
+
   add_executable(block_jacobi_preconditioner_benchmark
     block_jacobi_preconditioner_benchmark.cc)
   add_dependencies_to_benchmark(block_jacobi_preconditioner_benchmark)
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index 0401696..0dd3b20 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -39,6 +39,7 @@
 #include "ceres/block_structure.h"
 #include "ceres/crs_matrix.h"
 #include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
 #include "ceres/small_blas.h"
 #include "ceres/triplet_sparse_matrix.h"
 #include "glog/logging.h"
@@ -88,25 +89,44 @@
 
 void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
                                                    double* y) const {
+  RightMultiplyAndAccumulate(x, y, nullptr, 1);
+}
+
+void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
+                                                   double* y,
+                                                   ContextImpl* context,
+                                                   int num_threads) const {
   CHECK(x != nullptr);
   CHECK(y != nullptr);
 
-  for (int i = 0; i < block_structure_->rows.size(); ++i) {
-    int row_block_pos = block_structure_->rows[i].block.position;
-    int row_block_size = block_structure_->rows[i].block.size;
-    const std::vector<Cell>& cells = block_structure_->rows[i].cells;
-    for (const auto& cell : cells) {
-      int col_block_id = cell.block_id;
-      int col_block_size = block_structure_->cols[col_block_id].size;
-      int col_block_pos = block_structure_->cols[col_block_id].position;
-      MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
-          values_.get() + cell.position,
-          row_block_size,
-          col_block_size,
-          x + col_block_pos,
-          y + row_block_pos);
-    }
-  }
+  const auto values = values_.get();
+  const auto block_structure = block_structure_.get();
+  const auto num_row_blocks = block_structure->rows.size();
+
+  ParallelFor(context,
+              0,
+              num_row_blocks,
+              num_threads,
+              [values, block_structure, x, y](int row_block_id) {
+                const int row_block_pos =
+                    block_structure->rows[row_block_id].block.position;
+                const int row_block_size =
+                    block_structure->rows[row_block_id].block.size;
+                const auto& cells = block_structure->rows[row_block_id].cells;
+                for (const auto& cell : cells) {
+                  const int col_block_id = cell.block_id;
+                  const int col_block_size =
+                      block_structure->cols[col_block_id].size;
+                  const int col_block_pos =
+                      block_structure->cols[col_block_id].position;
+                  MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+                      values + cell.position,
+                      row_block_size,
+                      col_block_size,
+                      x + col_block_pos,
+                      y + row_block_pos);
+                }
+              });
 }
 
 void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h
index b7ea1df..3a8dac9 100644
--- a/internal/ceres/block_sparse_matrix.h
+++ b/internal/ceres/block_sparse_matrix.h
@@ -39,6 +39,7 @@
 
 #include "ceres/block_structure.h"
 #include "ceres/compressed_row_sparse_matrix.h"
+#include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/internal/export.h"
@@ -66,13 +67,16 @@
   // CompressedRowBlockStructure objects.
   explicit BlockSparseMatrix(CompressedRowBlockStructure* block_structure);
 
-  BlockSparseMatrix();
   BlockSparseMatrix(const BlockSparseMatrix&) = delete;
   void operator=(const BlockSparseMatrix&) = delete;
 
   // Implementation of SparseMatrix interface.
   void SetZero() final;
   void RightMultiplyAndAccumulate(const double* x, double* y) const final;
+  void RightMultiplyAndAccumulate(const double* x,
+                                  double* y,
+                                  ContextImpl* context,
+                                  int num_threads) const final;
   void LeftMultiplyAndAccumulate(const double* x, double* y) const final;
   void SquaredColumnNorm(double* x) const final;
   void ScaleColumns(const double* scale) final;
diff --git a/internal/ceres/block_sparse_matrix_test.cc b/internal/ceres/block_sparse_matrix_test.cc
index 7d7d4c9..3f5c78a 100644
--- a/internal/ceres/block_sparse_matrix_test.cc
+++ b/internal/ceres/block_sparse_matrix_test.cc
@@ -115,6 +115,8 @@
 }
 }  // namespace
 
+const int kNumThreads = 4;
+
 class BlockSparseMatrixTest : public ::testing::Test {
  protected:
   void SetUp() final {
@@ -130,10 +132,12 @@
     CHECK_EQ(A_->num_rows(), B_->num_rows());
     CHECK_EQ(A_->num_cols(), B_->num_cols());
     CHECK_EQ(A_->num_nonzeros(), B_->num_nonzeros());
+    context_.EnsureMinimumThreads(kNumThreads);
   }
 
   std::unique_ptr<BlockSparseMatrix> A_;
   std::unique_ptr<TripletSparseMatrix> B_;
+  ContextImpl context_;
 };
 
 TEST_F(BlockSparseMatrixTest, SetZeroTest) {
@@ -153,6 +157,20 @@
   }
 }
 
+TEST_F(BlockSparseMatrixTest, RightMultiplyAndAccumulateParallelTest) {
+  Vector y_0 = Vector::Random(A_->num_rows());
+  Vector y_s = y_0;
+  Vector y_p = y_0;
+
+  Vector x = Vector::Random(A_->num_cols());
+  A_->RightMultiplyAndAccumulate(x.data(), y_s.data());
+
+  A_->RightMultiplyAndAccumulate(x.data(), y_p.data(), &context_, kNumThreads);
+
+  // Current parallel implementation is expected to be bit-exact
+  EXPECT_EQ((y_s - y_p).norm(), 0.);
+}
+
 TEST_F(BlockSparseMatrixTest, LeftMultiplyAndAccumulateTest) {
   Vector y_a = Vector::Zero(A_->num_cols());
   Vector y_b = Vector::Zero(A_->num_cols());
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index 17a5f62..e1c571d 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -84,14 +84,21 @@
 class CERES_NO_EXPORT CgnrLinearOperator final
     : public ConjugateGradientsLinearOperator<Vector> {
  public:
-  CgnrLinearOperator(const LinearOperator& A, const double* D)
-      : A_(A), D_(D), z_(Vector::Zero(A.num_rows())) {}
+  CgnrLinearOperator(const LinearOperator& A,
+                     const double* D,
+                     ContextImpl* context,
+                     int num_threads)
+      : A_(A),
+        D_(D),
+        z_(Vector::Zero(A.num_rows())),
+        context_(context),
+        num_threads_(num_threads) {}
 
   void RightMultiplyAndAccumulate(const Vector& x, Vector& y) final {
     // z = Ax
     // y = y + Atz
     z_.setZero();
-    A_.RightMultiplyAndAccumulate(x, z_);
+    A_.RightMultiplyAndAccumulate(x, z_, context_, num_threads_);
     A_.LeftMultiplyAndAccumulate(z_, y);
 
     // y = y + DtDx
@@ -105,6 +112,9 @@
   const LinearOperator& A_;
   const double* D_;
   Vector z_;
+
+  ContextImpl* context_;
+  int num_threads_;
 };
 
 CgnrSolver::CgnrSolver(LinearSolver::Options options)
@@ -166,7 +176,8 @@
   cg_options.r_tolerance = per_solve_options.r_tolerance;
 
   // lhs = AtA + DtD
-  CgnrLinearOperator lhs(*A, per_solve_options.D);
+  CgnrLinearOperator lhs(
+      *A, per_solve_options.D, options_.context, options_.num_threads);
   // rhs = Atb.
   Vector rhs(A->num_cols());
   rhs.setZero();
diff --git a/internal/ceres/compressed_row_sparse_matrix.cc b/internal/ceres/compressed_row_sparse_matrix.cc
index 057adfd..553dc21 100644
--- a/internal/ceres/compressed_row_sparse_matrix.cc
+++ b/internal/ceres/compressed_row_sparse_matrix.cc
@@ -37,8 +37,10 @@
 #include <random>
 #include <vector>
 
+#include "ceres/context_impl.h"
 #include "ceres/crs_matrix.h"
 #include "ceres/internal/export.h"
+#include "ceres/parallel_for.h"
 #include "ceres/triplet_sparse_matrix.h"
 #include "glog/logging.h"
 
@@ -278,19 +280,34 @@
 
 // TODO(sameeragarwal): Make RightMultiplyAndAccumulate and
 // LeftMultiplyAndAccumulate block-aware for higher performance.
+void CompressedRowSparseMatrix::RightMultiplyAndAccumulate(
+    const double* x, double* y, ContextImpl* context, int num_threads) const {
+  if (storage_type_ != StorageType::UNSYMMETRIC) {
+    RightMultiplyAndAccumulate(x, y);
+    return;
+  }
+
+  auto values = values_.data();
+  auto rows = rows_.data();
+  auto cols = cols_.data();
+
+  ParallelFor(
+      context, 0, num_rows_, num_threads, [values, rows, cols, x, y](int row) {
+        for (int idx = rows[row]; idx < rows[row + 1]; ++idx) {
+          const int c = cols[idx];
+          const double v = values[idx];
+          y[row] += v * x[c];
+        }
+      });
+}
+
 void CompressedRowSparseMatrix::RightMultiplyAndAccumulate(const double* x,
                                                            double* y) const {
   CHECK(x != nullptr);
   CHECK(y != nullptr);
 
   if (storage_type_ == StorageType::UNSYMMETRIC) {
-    for (int r = 0; r < num_rows_; ++r) {
-      for (int idx = rows_[r]; idx < rows_[r + 1]; ++idx) {
-        const int c = cols_[idx];
-        const double v = values_[idx];
-        y[r] += v * x[c];
-      }
-    }
+    RightMultiplyAndAccumulate(x, y, nullptr, 1);
   } else if (storage_type_ == StorageType::UPPER_TRIANGULAR) {
     // Because of their block structure, we will have entries that lie
     // above (below) the diagonal for lower (upper) triangular matrices,
diff --git a/internal/ceres/compressed_row_sparse_matrix.h b/internal/ceres/compressed_row_sparse_matrix.h
index 8668d06..6fe730e 100644
--- a/internal/ceres/compressed_row_sparse_matrix.h
+++ b/internal/ceres/compressed_row_sparse_matrix.h
@@ -48,6 +48,7 @@
 
 namespace internal {
 
+class ContextImpl;
 class TripletSparseMatrix;
 
 class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix {
@@ -103,6 +104,10 @@
   ~CompressedRowSparseMatrix() override;
   void SetZero() final;
   void RightMultiplyAndAccumulate(const double* x, double* y) const final;
+  void RightMultiplyAndAccumulate(const double* x,
+                                  double* y,
+                                  ContextImpl* context,
+                                  int num_threads) const final;
   void LeftMultiplyAndAccumulate(const double* x, double* y) const final;
   void SquaredColumnNorm(double* x) const final;
   void ScaleColumns(const double* scale) final;
diff --git a/internal/ceres/compressed_row_sparse_matrix_test.cc b/internal/ceres/compressed_row_sparse_matrix_test.cc
index 82c982b..46136db 100644
--- a/internal/ceres/compressed_row_sparse_matrix_test.cc
+++ b/internal/ceres/compressed_row_sparse_matrix_test.cc
@@ -39,6 +39,7 @@
 
 #include "Eigen/SparseCore"
 #include "ceres/casts.h"
+#include "ceres/context_impl.h"
 #include "ceres/crs_matrix.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/linear_least_squares_problems.h"
@@ -599,6 +600,73 @@
                       CompressedRowSparseMatrix::StorageType::UNSYMMETRIC),
     ParamInfoToString);
 
+const int kMaxNumThreads = 8;
+class CompressedRowSparseMatrixParallelTest
+    : public ::testing::TestWithParam<int> {
+  void SetUp() final { context_.EnsureMinimumThreads(kMaxNumThreads); }
+
+ protected:
+  ContextImpl context_;
+};
+
+TEST_P(CompressedRowSparseMatrixParallelTest,
+       RightMultiplyAndAccumulateUnsymmetric) {
+  const int kMinNumBlocks = 1;
+  const int kMaxNumBlocks = 10;
+  const int kMinBlockSize = 1;
+  const int kMaxBlockSize = 5;
+  const int kNumTrials = 10;
+  const int kNumThreads = GetParam();
+  std::mt19937 prng;
+  std::uniform_real_distribution<double> uniform(0.5, 1.0);
+  for (int num_blocks = kMinNumBlocks; num_blocks < kMaxNumBlocks;
+       ++num_blocks) {
+    for (int trial = 0; trial < kNumTrials; ++trial) {
+      CompressedRowSparseMatrix::RandomMatrixOptions options;
+      options.num_col_blocks = num_blocks;
+      options.min_col_block_size = kMinBlockSize;
+      options.max_col_block_size = kMaxBlockSize;
+      options.num_row_blocks = 2 * num_blocks;
+      options.min_row_block_size = kMinBlockSize;
+      options.max_row_block_size = kMaxBlockSize;
+      options.block_density = uniform(prng);
+      options.storage_type =
+          CompressedRowSparseMatrix::StorageType::UNSYMMETRIC;
+      auto matrix =
+          CompressedRowSparseMatrix::CreateRandomMatrix(options, prng);
+      const int num_rows = matrix->num_rows();
+      const int num_cols = matrix->num_cols();
+
+      Vector x(num_cols);
+      x.setRandom();
+
+      Vector actual_y(num_rows);
+      actual_y.setZero();
+      matrix->RightMultiplyAndAccumulate(
+          x.data(), actual_y.data(), &context_, kNumThreads);
+
+      Matrix dense;
+      matrix->ToDenseMatrix(&dense);
+      Vector expected_y = dense * x;
+
+      ASSERT_NEAR((expected_y - actual_y).norm() / actual_y.norm(),
+                  0.0,
+                  std::numeric_limits<double>::epsilon() * 10)
+          << "\n"
+          << dense << "x:\n"
+          << x.transpose() << "\n"
+          << "expected: \n"
+          << expected_y.transpose() << "\n"
+          << "actual: \n"
+          << actual_y.transpose();
+    }
+  }
+}
+INSTANTIATE_TEST_SUITE_P(ParallelProducts,
+                         CompressedRowSparseMatrixParallelTest,
+                         ::testing::Values(1, 2, 4, 8),
+                         ::testing::PrintToStringParamName());
+
 // TODO(sameeragarwal) Add tests for the random matrix creation methods.
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/fake_bundle_adjustment_jacobian.cc b/internal/ceres/fake_bundle_adjustment_jacobian.cc
index 9e7d82b..dd2aee8 100644
--- a/internal/ceres/fake_bundle_adjustment_jacobian.cc
+++ b/internal/ceres/fake_bundle_adjustment_jacobian.cc
@@ -96,4 +96,23 @@
   return jacobian;
 }
 
+std::pair<
+    std::unique_ptr<PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>>,
+    std::unique_ptr<BlockSparseMatrix>>
+CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras,
+                                              int num_points,
+                                              int camera_size,
+                                              int landmark_size,
+                                              double visibility,
+                                              std::mt19937& rng) {
+  using PartitionedView =
+      PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>;
+  auto block_sparse_matrix = CreateFakeBundleAdjustmentJacobian(
+      num_cameras, num_points, camera_size, landmark_size, visibility, rng);
+  auto partitioned_view =
+      std::make_unique<PartitionedView>(*block_sparse_matrix, num_points);
+  return std::make_pair(std::move(partitioned_view),
+                        std::move(block_sparse_matrix));
+}
+
 }  // namespace ceres::internal
diff --git a/internal/ceres/fake_bundle_adjustment_jacobian.h b/internal/ceres/fake_bundle_adjustment_jacobian.h
index 9492b52..41b55c8 100644
--- a/internal/ceres/fake_bundle_adjustment_jacobian.h
+++ b/internal/ceres/fake_bundle_adjustment_jacobian.h
@@ -36,6 +36,7 @@
 #include <random>
 
 #include "ceres/block_sparse_matrix.h"
+#include "ceres/partitioned_matrix_view.h"
 
 namespace ceres::internal {
 std::unique_ptr<BlockSparseMatrix> CreateFakeBundleAdjustmentJacobian(
@@ -46,6 +47,32 @@
     double visibility,
     std::mt19937& prng);
 
+template <int kEBlockSize = 3, int kFBlockSize = 6>
+std::pair<std::unique_ptr<PartitionedMatrixView<2, kEBlockSize, kFBlockSize>>,
+          std::unique_ptr<BlockSparseMatrix>>
+CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras,
+                                              int num_points,
+                                              double visibility,
+                                              std::mt19937& rng) {
+  using PartitionedView = PartitionedMatrixView<2, kEBlockSize, kFBlockSize>;
+  auto block_sparse_matrix = CreateFakeBundleAdjustmentJacobian(
+      num_cameras, num_points, kFBlockSize, kEBlockSize, visibility, rng);
+  auto partitioned_view =
+      std::make_unique<PartitionedView>(*block_sparse_matrix, num_points);
+  return std::make_pair(std::move(partitioned_view),
+                        std::move(block_sparse_matrix));
+}
+
+std::pair<
+    std::unique_ptr<PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>>,
+    std::unique_ptr<BlockSparseMatrix>>
+CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras,
+                                              int num_points,
+                                              int camera_size,
+                                              int landmark_size,
+                                              double visibility,
+                                              std::mt19937& rng);
+
 }  // namespace ceres::internal
 
 #endif  // CERES_INTERNAL_FAKE_BUNDLE_ADJUSTMENT_JACOBIAN
diff --git a/internal/ceres/implicit_schur_complement.cc b/internal/ceres/implicit_schur_complement.cc
index 9b7ee47..bbd5e4b 100644
--- a/internal/ceres/implicit_schur_complement.cc
+++ b/internal/ceres/implicit_schur_complement.cc
@@ -105,7 +105,8 @@
                                                          double* y) const {
   // y1 = F x
   tmp_rows_.setZero();
-  A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
+  A_->RightMultiplyAndAccumulateF(
+      x, tmp_rows_.data(), options_.context, options_.num_threads);
 
   // y2 = E' y1
   tmp_e_cols_.setZero();
@@ -114,11 +115,16 @@
   // y3 = -(E'E)^-1 y2
   tmp_e_cols_2_.setZero();
   block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
-                                                          tmp_e_cols_2_.data());
+                                                          tmp_e_cols_2_.data(),
+                                                          options_.context,
+                                                          options_.num_threads);
   tmp_e_cols_2_ *= -1.0;
 
   // y1 = y1 + E y3
-  A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
+  A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(),
+                                  tmp_rows_.data(),
+                                  options_.context,
+                                  options_.num_threads);
 
   // y5 = D * x
   if (D_ != nullptr) {
@@ -139,7 +145,8 @@
   CHECK(compute_ftf_inverse_);
   // y1 = F x
   tmp_rows_.setZero();
-  A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
+  A_->RightMultiplyAndAccumulateF(
+      x, tmp_rows_.data(), options_.context, options_.num_threads);
 
   // y2 = E' y1
   tmp_e_cols_.setZero();
@@ -148,18 +155,23 @@
   // y3 = (E'E)^-1 y2
   tmp_e_cols_2_.setZero();
   block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
-                                                          tmp_e_cols_2_.data());
+                                                          tmp_e_cols_2_.data(),
+                                                          options_.context,
+                                                          options_.num_threads);
   // y1 = E y3
   tmp_rows_.setZero();
-  A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
+  A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(),
+                                  tmp_rows_.data(),
+                                  options_.context,
+                                  options_.num_threads);
 
   // y4 = F' y1
   tmp_f_cols_.setZero();
   A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), tmp_f_cols_.data());
 
   // y += (F'F)^-1 y4
-  block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate(tmp_f_cols_.data(),
-                                                          y);
+  block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate(
+      tmp_f_cols_.data(), y, options_.context, options_.num_threads);
 }
 
 // Given a block diagonal matrix and an optional array of diagonal
@@ -197,7 +209,8 @@
 
   // y1 = F x
   tmp_rows_.setZero();
-  A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
+  A_->RightMultiplyAndAccumulateF(
+      x, tmp_rows_.data(), options_.context, options_.num_threads);
 
   // y2 = b - y1
   tmp_rows_ = ConstVectorRef(b_, num_rows) - tmp_rows_;
@@ -208,8 +221,8 @@
 
   // y = (E'E)^-1 y3
   VectorRef(y, num_cols).setZero();
-  block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
-                                                          y);
+  block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(
+      tmp_e_cols_.data(), y, options_.context, options_.num_threads);
 
   // The full solution vector y has two blocks. The first block of
   // variables corresponds to the eliminated variables, which we just
@@ -232,12 +245,13 @@
 
   // y2 = (E'E)^-1 y1
   Vector y2 = Vector::Zero(A_->num_cols_e());
-  block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
-                                                          y2.data());
+  block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(
+      tmp_e_cols_.data(), y2.data(), options_.context, options_.num_threads);
 
   // y3 = E y2
   tmp_rows_.setZero();
-  A_->RightMultiplyAndAccumulateE(y2.data(), tmp_rows_.data());
+  A_->RightMultiplyAndAccumulateE(
+      y2.data(), tmp_rows_.data(), options_.context, options_.num_threads);
 
   // y3 = b - y3
   tmp_rows_ = ConstVectorRef(b_, A_->num_rows()) - tmp_rows_;
diff --git a/internal/ceres/linear_operator.cc b/internal/ceres/linear_operator.cc
index 597a37e..1b518a6 100644
--- a/internal/ceres/linear_operator.cc
+++ b/internal/ceres/linear_operator.cc
@@ -30,8 +30,34 @@
 
 #include "ceres/linear_operator.h"
 
+#include <glog/logging.h>
+
 namespace ceres::internal {
 
+void LinearOperator::RightMultiplyAndAccumulate(const double* x,
+                                                double* y,
+                                                ContextImpl* context,
+                                                int num_threads) const {
+  (void)context;
+  if (num_threads != 1) {
+    VLOG(3) << "Parallel right product is not supported by linear operator "
+               "implementation";
+  }
+  RightMultiplyAndAccumulate(x, y);
+}
+
+void LinearOperator::LeftMultiplyAndAccumulate(const double* x,
+                                               double* y,
+                                               ContextImpl* context,
+                                               int num_threads) const {
+  (void)context;
+  if (num_threads != 1) {
+    VLOG(3) << "Parallel left product is not supported by linear operator "
+               "implementation";
+  }
+  LeftMultiplyAndAccumulate(x, y);
+}
+
 LinearOperator::~LinearOperator() = default;
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/linear_operator.h b/internal/ceres/linear_operator.h
index 8a1e902..76beb40 100644
--- a/internal/ceres/linear_operator.h
+++ b/internal/ceres/linear_operator.h
@@ -39,6 +39,8 @@
 
 namespace ceres::internal {
 
+class ContextImpl;
+
 // This is an abstract base class for linear operators. It supports
 // access to size information and left and right multiply operators.
 class CERES_NO_EXPORT LinearOperator {
@@ -47,8 +49,16 @@
 
   // y = y + Ax;
   virtual void RightMultiplyAndAccumulate(const double* x, double* y) const = 0;
+  virtual void RightMultiplyAndAccumulate(const double* x,
+                                          double* y,
+                                          ContextImpl* context,
+                                          int num_threads) const;
   // y = y + A'x;
   virtual void LeftMultiplyAndAccumulate(const double* x, double* y) const = 0;
+  virtual void LeftMultiplyAndAccumulate(const double* x,
+                                         double* y,
+                                         ContextImpl* context,
+                                         int num_threads) const;
 
   virtual void RightMultiplyAndAccumulate(const Vector& x, Vector& y) const {
     RightMultiplyAndAccumulate(x.data(), y.data());
@@ -58,6 +68,20 @@
     LeftMultiplyAndAccumulate(x.data(), y.data());
   }
 
+  virtual void RightMultiplyAndAccumulate(const Vector& x,
+                                          Vector& y,
+                                          ContextImpl* context,
+                                          int num_threads) const {
+    RightMultiplyAndAccumulate(x.data(), y.data(), context, num_threads);
+  }
+
+  virtual void LeftMultiplyAndAccumulate(const Vector& x,
+                                         Vector& y,
+                                         ContextImpl* context,
+                                         int num_threads) const {
+    LeftMultiplyAndAccumulate(x.data(), y.data(), context, num_threads);
+  }
+
   virtual int num_rows() const = 0;
   virtual int num_cols() const = 0;
 };
diff --git a/internal/ceres/partitioned_matrix_view.h b/internal/ceres/partitioned_matrix_view.h
index 7fb1a09..86106da 100644
--- a/internal/ceres/partitioned_matrix_view.h
+++ b/internal/ceres/partitioned_matrix_view.h
@@ -52,6 +52,8 @@
 
 namespace ceres::internal {
 
+class ContextImpl;
+
 // Given generalized bi-partite matrix A = [E F], with the same block
 // structure as required by the Schur complement based solver, found
 // in schur_complement_solver.h, provide access to the
@@ -75,10 +77,18 @@
   // y += Ex
   virtual void RightMultiplyAndAccumulateE(const double* x,
                                            double* y) const = 0;
+  virtual void RightMultiplyAndAccumulateE(const double* x,
+                                           double* y,
+                                           ContextImpl* context,
+                                           int num_threads) const = 0;
 
   // y += Fx
   virtual void RightMultiplyAndAccumulateF(const double* x,
                                            double* y) const = 0;
+  virtual void RightMultiplyAndAccumulateF(const double* x,
+                                           double* y,
+                                           ContextImpl* context,
+                                           int num_threads) const = 0;
 
   // Create and return the block diagonal of the matrix E'E.
   virtual std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const = 0;
@@ -130,6 +140,14 @@
   void LeftMultiplyAndAccumulateF(const double* x, double* y) const final;
   void RightMultiplyAndAccumulateE(const double* x, double* y) const final;
   void RightMultiplyAndAccumulateF(const double* x, double* y) const final;
+  void RightMultiplyAndAccumulateE(const double* x,
+                                   double* y,
+                                   ContextImpl* context,
+                                   int num_threads) const final;
+  void RightMultiplyAndAccumulateF(const double* x,
+                                   double* y,
+                                   ContextImpl* context,
+                                   int num_threads) const final;
   std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const final;
   std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalFtF() const final;
   void UpdateBlockDiagonalEtE(BlockSparseMatrix* block_diagonal) const final;
diff --git a/internal/ceres/partitioned_matrix_view_benchmark.cc b/internal/ceres/partitioned_matrix_view_benchmark.cc
new file mode 100644
index 0000000..63d0e34
--- /dev/null
+++ b/internal/ceres/partitioned_matrix_view_benchmark.cc
@@ -0,0 +1,147 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#include <memory>
+#include <random>
+
+#include "benchmark/benchmark.h"
+#include "ceres/context_impl.h"
+#include "ceres/fake_bundle_adjustment_jacobian.h"
+#include "ceres/partitioned_matrix_view.h"
+
+constexpr int kNumCameras = 1000;
+constexpr int kNumPoints = 10000;
+constexpr int kCameraSize = 6;
+constexpr int kPointSize = 3;
+constexpr double kVisibility = 0.1;
+
+namespace ceres::internal {
+
+static void BM_PatitionedViewRightMultiplyAndAccumulateE_Static(
+    benchmark::State& state) {
+  const int num_threads = state.range(0);
+  std::mt19937 prng;
+  auto [partitioned_view, jacobian] =
+      CreateFakeBundleAdjustmentPartitionedJacobian<kPointSize, kCameraSize>(
+          kNumCameras, kNumPoints, kVisibility, prng);
+
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  Vector x(partitioned_view->num_cols_e());
+  Vector y(partitioned_view->num_rows());
+  x.setRandom();
+  y.setRandom();
+  double sum = 0;
+  for (auto _ : state) {
+    partitioned_view->RightMultiplyAndAccumulateE(
+        x.data(), y.data(), &context, num_threads);
+    sum += y.norm();
+  }
+  CHECK_NE(sum, 0.0);
+}
+BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateE_Static)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(16);
+
+static void BM_PatitionedViewRightMultiplyAndAccumulateE_Dynamic(
+    benchmark::State& state) {
+  std::mt19937 prng;
+  auto [partitioned_view, jacobian] =
+      CreateFakeBundleAdjustmentPartitionedJacobian(
+          kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
+
+  Vector x(partitioned_view->num_cols_e());
+  Vector y(partitioned_view->num_rows());
+  x.setRandom();
+  y.setRandom();
+  double sum = 0;
+  for (auto _ : state) {
+    partitioned_view->RightMultiplyAndAccumulateE(x.data(), y.data());
+    sum += y.norm();
+  }
+  CHECK_NE(sum, 0.0);
+}
+BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateE_Dynamic);
+
+static void BM_PatitionedViewRightMultiplyAndAccumulateF_Static(
+    benchmark::State& state) {
+  const int num_threads = state.range(0);
+  std::mt19937 prng;
+  auto [partitioned_view, jacobian] =
+      CreateFakeBundleAdjustmentPartitionedJacobian<kPointSize, kCameraSize>(
+          kNumCameras, kNumPoints, kVisibility, prng);
+
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  Vector x(partitioned_view->num_cols_f());
+  Vector y(partitioned_view->num_rows());
+  x.setRandom();
+  y.setRandom();
+  double sum = 0;
+  for (auto _ : state) {
+    partitioned_view->RightMultiplyAndAccumulateF(
+        x.data(), y.data(), &context, num_threads);
+    sum += y.norm();
+  }
+  CHECK_NE(sum, 0.0);
+}
+BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateF_Static)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(16);
+
+static void BM_PatitionedViewRightMultiplyAndAccumulateF_Dynamic(
+    benchmark::State& state) {
+  std::mt19937 prng;
+  auto [partitioned_view, jacobian] =
+      CreateFakeBundleAdjustmentPartitionedJacobian(
+          kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
+
+  Vector x(partitioned_view->num_cols_f());
+  Vector y(partitioned_view->num_rows());
+  x.setRandom();
+  y.setRandom();
+  double sum = 0;
+  for (auto _ : state) {
+    partitioned_view->RightMultiplyAndAccumulateF(x.data(), y.data());
+    sum += y.norm();
+  }
+  CHECK_NE(sum, 0.0);
+}
+BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateF_Dynamic);
+
+}  // namespace ceres::internal
+
+BENCHMARK_MAIN();
diff --git a/internal/ceres/partitioned_matrix_view_impl.h b/internal/ceres/partitioned_matrix_view_impl.h
index 2150660..fea0a7d 100644
--- a/internal/ceres/partitioned_matrix_view_impl.h
+++ b/internal/ceres/partitioned_matrix_view_impl.h
@@ -36,6 +36,7 @@
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/block_structure.h"
 #include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
 #include "ceres/partitioned_matrix_view.h"
 #include "ceres/small_blas.h"
 #include "glog/logging.h"
@@ -88,71 +89,100 @@
 template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
 void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
     RightMultiplyAndAccumulateE(const double* x, double* y) const {
-  const CompressedRowBlockStructure* bs = matrix_.block_structure();
+  RightMultiplyAndAccumulateE(x, y, nullptr, 1);
+}
 
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+    RightMultiplyAndAccumulateE(const double* x,
+                                double* y,
+                                ContextImpl* context,
+                                int num_threads) const {
   // Iterate over the first num_row_blocks_e_ row blocks, and multiply
   // by the first cell in each row block.
+  auto bs = matrix_.block_structure();
   const double* values = matrix_.values();
-  for (int r = 0; r < num_row_blocks_e_; ++r) {
-    const Cell& cell = bs->rows[r].cells[0];
-    const int row_block_pos = bs->rows[r].block.position;
-    const int row_block_size = bs->rows[r].block.size;
-    const int col_block_id = cell.block_id;
-    const int col_block_pos = bs->cols[col_block_id].position;
-    const int col_block_size = bs->cols[col_block_id].size;
-    // clang-format off
-    MatrixVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
-        values + cell.position, row_block_size, col_block_size,
-        x + col_block_pos,
-        y + row_block_pos);
-    // clang-format on
-  }
+  ParallelFor(context,
+              0,
+              num_row_blocks_e_,
+              num_threads,
+              [values, bs, x, y](int row_block_id) {
+                const Cell& cell = bs->rows[row_block_id].cells[0];
+                const int row_block_pos = bs->rows[row_block_id].block.position;
+                const int row_block_size = bs->rows[row_block_id].block.size;
+                const int col_block_id = cell.block_id;
+                const int col_block_pos = bs->cols[col_block_id].position;
+                const int col_block_size = bs->cols[col_block_id].size;
+                // clang-format off
+                MatrixVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
+                    values + cell.position, row_block_size, col_block_size,
+                    x + col_block_pos,
+                    y + row_block_pos);
+                // clang-format on
+              });
 }
 
 template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
 void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
     RightMultiplyAndAccumulateF(const double* x, double* y) const {
-  const CompressedRowBlockStructure* bs = matrix_.block_structure();
+  RightMultiplyAndAccumulateF(x, y, nullptr, 1);
+}
 
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+    RightMultiplyAndAccumulateF(const double* x,
+                                double* y,
+                                ContextImpl* context,
+                                int num_threads) const {
   // Iterate over row blocks, and if the row block is in E, then
   // multiply by all the cells except the first one which is of type
   // E. If the row block is not in E (i.e its in the bottom
   // num_row_blocks - num_row_blocks_e row blocks), then all the cells
   // are of type F and multiply by them all.
+  const CompressedRowBlockStructure* bs = matrix_.block_structure();
+  const int num_row_blocks = bs->rows.size();
+  const int num_cols_e = num_cols_e_;
   const double* values = matrix_.values();
-  for (int r = 0; r < num_row_blocks_e_; ++r) {
-    const int row_block_pos = bs->rows[r].block.position;
-    const int row_block_size = bs->rows[r].block.size;
-    const std::vector<Cell>& cells = bs->rows[r].cells;
-    for (int c = 1; c < cells.size(); ++c) {
-      const int col_block_id = cells[c].block_id;
-      const int col_block_pos = bs->cols[col_block_id].position;
-      const int col_block_size = bs->cols[col_block_id].size;
-      // clang-format off
-      MatrixVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
-          values + cells[c].position, row_block_size, col_block_size,
-          x + col_block_pos - num_cols_e_,
-          y + row_block_pos);
-      // clang-format on
-    }
-  }
-
-  for (int r = num_row_blocks_e_; r < bs->rows.size(); ++r) {
-    const int row_block_pos = bs->rows[r].block.position;
-    const int row_block_size = bs->rows[r].block.size;
-    const std::vector<Cell>& cells = bs->rows[r].cells;
-    for (const auto& cell : cells) {
-      const int col_block_id = cell.block_id;
-      const int col_block_pos = bs->cols[col_block_id].position;
-      const int col_block_size = bs->cols[col_block_id].size;
-      // clang-format off
-      MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
-          values + cell.position, row_block_size, col_block_size,
-          x + col_block_pos - num_cols_e_,
-          y + row_block_pos);
-      // clang-format on
-    }
-  }
+  ParallelFor(context,
+              0,
+              num_row_blocks_e_,
+              num_threads,
+              [values, bs, num_cols_e, x, y](int row_block_id) {
+                const int row_block_pos = bs->rows[row_block_id].block.position;
+                const int row_block_size = bs->rows[row_block_id].block.size;
+                const auto& cells = bs->rows[row_block_id].cells;
+                for (int c = 1; c < cells.size(); ++c) {
+                  const int col_block_id = cells[c].block_id;
+                  const int col_block_pos = bs->cols[col_block_id].position;
+                  const int col_block_size = bs->cols[col_block_id].size;
+                  // clang-format off
+                  MatrixVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
+                      values + cells[c].position, row_block_size, col_block_size,
+                      x + col_block_pos - num_cols_e,
+                      y + row_block_pos);
+                  // clang-format on
+                }
+              });
+  ParallelFor(context,
+              num_row_blocks_e_,
+              num_row_blocks,
+              num_threads,
+              [values, bs, num_cols_e, x, y](int row_block_id) {
+                const int row_block_pos = bs->rows[row_block_id].block.position;
+                const int row_block_size = bs->rows[row_block_id].block.size;
+                const auto& cells = bs->rows[row_block_id].cells;
+                for (const auto& cell : cells) {
+                  const int col_block_id = cell.block_id;
+                  const int col_block_pos = bs->cols[col_block_id].position;
+                  const int col_block_size = bs->cols[col_block_id].size;
+                  // clang-format off
+                  MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+                      values + cell.position, row_block_size, col_block_size,
+                      x + col_block_pos - num_cols_e,
+                      y + row_block_pos);
+                  // clang-format on
+                }
+              });
 }
 
 template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
diff --git a/internal/ceres/partitioned_matrix_view_test.cc b/internal/ceres/partitioned_matrix_view_test.cc
index e4484dd..b11ea8e 100644
--- a/internal/ceres/partitioned_matrix_view_test.cc
+++ b/internal/ceres/partitioned_matrix_view_test.cc
@@ -174,5 +174,64 @@
   EXPECT_NEAR(block_diagonal_ff->values()[2], 37.0, kEpsilon);
 }
 
+const int kMaxNumThreads = 8;
+class PartitionedMatrixViewParallelTest : public ::testing::TestWithParam<int> {
+ protected:
+  void SetUp() final {
+    std::unique_ptr<LinearLeastSquaresProblem> problem =
+        CreateLinearLeastSquaresProblemFromId(2);
+    CHECK(problem != nullptr);
+    A_ = std::move(problem->A);
+
+    num_cols_ = A_->num_cols();
+    num_rows_ = A_->num_rows();
+    num_eliminate_blocks_ = problem->num_eliminate_blocks;
+    LinearSolver::Options options;
+    options.elimination_groups.push_back(num_eliminate_blocks_);
+    pmv_ = PartitionedMatrixViewBase::Create(
+        options, *down_cast<BlockSparseMatrix*>(A_.get()));
+    context_.EnsureMinimumThreads(kMaxNumThreads);
+  }
+
+  double RandDouble() { return distribution_(prng_); }
+
+  ContextImpl context_;
+  int num_rows_;
+  int num_cols_;
+  int num_eliminate_blocks_;
+  std::unique_ptr<SparseMatrix> A_;
+  std::unique_ptr<PartitionedMatrixViewBase> pmv_;
+  std::mt19937 prng_;
+  std::uniform_real_distribution<double> distribution_ =
+      std::uniform_real_distribution<double>(0.0, 1.0);
+};
+
+TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateEParallel) {
+  const int kNumThreads = GetParam();
+  Vector x1(pmv_->num_cols_e());
+  Vector x2(pmv_->num_cols());
+  x2.setZero();
+
+  for (int i = 0; i < pmv_->num_cols_e(); ++i) {
+    x1(i) = x2(i) = RandDouble();
+  }
+
+  Vector y1 = Vector::Zero(pmv_->num_rows());
+  pmv_->RightMultiplyAndAccumulateE(
+      x1.data(), y1.data(), &context_, kNumThreads);
+
+  Vector y2 = Vector::Zero(pmv_->num_rows());
+  A_->RightMultiplyAndAccumulate(x2.data(), y2.data());
+
+  for (int i = 0; i < pmv_->num_rows(); ++i) {
+    EXPECT_NEAR(y1(i), y2(i), kEpsilon);
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(ParallelProducts,
+                         PartitionedMatrixViewParallelTest,
+                         ::testing::Values(1, 2, 4, 8),
+                         ::testing::PrintToStringParamName());
+
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/power_series_expansion_preconditioner_test.cc b/internal/ceres/power_series_expansion_preconditioner_test.cc
index a291405..233d4f7 100644
--- a/internal/ceres/power_series_expansion_preconditioner_test.cc
+++ b/internal/ceres/power_series_expansion_preconditioner_test.cc
@@ -45,10 +45,9 @@
     const auto A = down_cast<BlockSparseMatrix*>(problem_->A.get());
     const auto D = problem_->D.get();
 
-    LinearSolver::Options options;
-    options.elimination_groups.push_back(problem_->num_eliminate_blocks);
-    options.preconditioner_type = SCHUR_POWER_SERIES_EXPANSION;
-    isc_ = std::make_unique<ImplicitSchurComplement>(options);
+    options_.elimination_groups.push_back(problem_->num_eliminate_blocks);
+    options_.preconditioner_type = SCHUR_POWER_SERIES_EXPANSION;
+    isc_ = std::make_unique<ImplicitSchurComplement>(options_);
     isc_->Init(*A, D, problem_->b.get());
     num_f_cols_ = isc_->rhs().rows();
     const int num_rows = A->num_rows(), num_cols = A->num_cols(),
@@ -76,6 +75,7 @@
   std::unique_ptr<ImplicitSchurComplement> isc_;
   int num_f_cols_;
   Matrix sc_inverse_expected_;
+  LinearSolver::Options options_;
 };
 
 TEST_F(PowerSeriesExpansionPreconditionerTest,
@@ -168,4 +168,4 @@
   }
 }
 
-}  // namespace ceres::internal
\ No newline at end of file
+}  // namespace ceres::internal
diff --git a/internal/ceres/sparse_matrix.h b/internal/ceres/sparse_matrix.h
index da6af18..e665c8b 100644
--- a/internal/ceres/sparse_matrix.h
+++ b/internal/ceres/sparse_matrix.h
@@ -41,6 +41,7 @@
 #include "ceres/types.h"
 
 namespace ceres::internal {
+struct ContextImpl;
 
 // This class defines the interface for storing and manipulating
 // sparse matrices. The key property that differentiates different
diff --git a/internal/ceres/spmv_benchmark.cc b/internal/ceres/spmv_benchmark.cc
index 0bd1e43..69df1ea 100644
--- a/internal/ceres/spmv_benchmark.cc
+++ b/internal/ceres/spmv_benchmark.cc
@@ -66,26 +66,37 @@
 
 static void BM_BlockSparseRightMultiplyAndAccumulateBA(
     benchmark::State& state) {
+  const int num_threads = state.range(0);
   std::mt19937 prng;
   auto jacobian = CreateFakeBundleAdjustmentJacobian(
       kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
 
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
   Vector x(jacobian->num_cols());
   Vector y(jacobian->num_rows());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
-    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
+    jacobian->RightMultiplyAndAccumulate(
+        x.data(), y.data(), &context, num_threads);
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }
 
-BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA);
+BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(16);
 
 static void BM_BlockSparseRightMultiplyAndAccumulateUnstructured(
     benchmark::State& state) {
+  const int num_threads = state.range(0);
   BlockSparseMatrix::RandomMatrixOptions options;
   options.num_row_blocks = kNumRowBlocks;
   options.num_col_blocks = kNumColBlocks;
@@ -98,19 +109,28 @@
 
   auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
 
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
   Vector x(jacobian->num_cols());
   Vector y(jacobian->num_rows());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
-    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
+    jacobian->RightMultiplyAndAccumulate(
+        x.data(), y.data(), &context, num_threads);
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }
 
-BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured);
+BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(16);
 
 static void BM_BlockSparseLeftMultiplyAndAccumulateBA(benchmark::State& state) {
   std::mt19937 prng;
@@ -158,6 +178,7 @@
 BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateUnstructured);
 
 static void BM_CRSRightMultiplyAndAccumulateBA(benchmark::State& state) {
+  const int num_threads = state.range(0);
   std::mt19937 prng;
   auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian(
       kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
@@ -167,22 +188,32 @@
                                      bsm_jacobian->num_nonzeros());
   bsm_jacobian->ToCompressedRowSparseMatrix(&jacobian);
 
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
   Vector x(jacobian.num_cols());
   Vector y(jacobian.num_rows());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
-    jacobian.RightMultiplyAndAccumulate(x.data(), y.data());
+    jacobian.RightMultiplyAndAccumulate(
+        x.data(), y.data(), &context, num_threads);
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }
 
-BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA);
+BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(16);
 
 static void BM_CRSRightMultiplyAndAccumulateUnstructured(
     benchmark::State& state) {
+  const int num_threads = state.range(0);
   BlockSparseMatrix::RandomMatrixOptions options;
   options.num_row_blocks = kNumRowBlocks;
   options.num_col_blocks = kNumColBlocks;
@@ -199,19 +230,28 @@
                                      bsm_jacobian->num_nonzeros());
   bsm_jacobian->ToCompressedRowSparseMatrix(&jacobian);
 
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
   Vector x(jacobian.num_cols());
   Vector y(jacobian.num_rows());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
-    jacobian.RightMultiplyAndAccumulate(x.data(), y.data());
+    jacobian.RightMultiplyAndAccumulate(
+        x.data(), y.data(), &context, num_threads);
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }
 
-BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured);
+BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(16);
 
 static void BM_CRSLeftMultiplyAndAccumulateBA(benchmark::State& state) {
   std::mt19937 prng;