Parallel left products for PartitionedMatrixView

Parallel left products for PartitionedMatrixView using parallel for
loops with fair partitioning.
Updates for evaluation benchmark including replacing program with a
preprocessed one

Change-Id: Ia1cc3293f106cec6b7d933675cea4e7c5a6b71e4
diff --git a/internal/ceres/bundle_adjustment_test_util.h b/internal/ceres/bundle_adjustment_test_util.h
index 6b45d46..696b39e 100644
--- a/internal/ceres/bundle_adjustment_test_util.h
+++ b/internal/ceres/bundle_adjustment_test_util.h
@@ -84,14 +84,15 @@
   Solver::Options* mutable_solver_options() { return &options_; }
 
   // clang-format off
-  int num_cameras()            const { return num_cameras_; }
-  int num_points()             const { return num_points_; }
-  int num_observations()       const { return num_observations_; }
-  const int* point_index()     const { return point_index_; }
-  const int* camera_index()    const { return camera_index_; }
-  const double* observations() const { return observations_; }
-  double* mutable_cameras()          { return parameters_; }
-  double* mutable_points()           { return parameters_ + 9 * num_cameras_; }
+  int num_cameras()                const { return num_cameras_; }
+  int num_points()                 const { return num_points_; }
+  int num_observations()           const { return num_observations_; }
+  const int* point_index()         const { return point_index_; }
+  const int* camera_index()        const { return camera_index_; }
+  const double* observations()     const { return observations_; }
+  double* mutable_cameras()              { return parameters_; }
+  double* mutable_points()               { return parameters_ + 9 * num_cameras_; }
+  const Solver::Options& options() const { return options_; }
   // clang-format on
 
   static double kResidualTolerance;
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 56b4f19..aba2386 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -39,6 +39,8 @@
 #include "ceres/cuda_sparse_matrix.h"
 #include "ceres/cuda_vector.h"
 #include "ceres/evaluator.h"
+#include "ceres/partitioned_matrix_view.h"
+#include "ceres/preprocessor.h"
 #include "ceres/problem.h"
 #include "ceres/problem_impl.h"
 #include "ceres/program.h"
@@ -56,12 +58,22 @@
 // each dataset is being loaded at most once.
 // Each type of jacobians is also cached after first creation
 struct BALData {
+  using PartitionedView = PartitionedMatrixView<2, 3, 9>;
   explicit BALData(const std::string& path) {
     bal_problem = std::make_unique<BundleAdjustmentProblem>(path);
     CHECK(bal_problem != nullptr);
 
     auto problem_impl = bal_problem->mutable_problem()->mutable_impl();
-    auto program = problem_impl->mutable_program();
+    auto preprocessor = Preprocessor::Create(MinimizerType::TRUST_REGION);
+
+    preprocessed_problem = std::make_unique<PreprocessedProblem>();
+    Solver::Options options = bal_problem->options();
+    options.linear_solver_type = ITERATIVE_SCHUR;
+    CHECK(preprocessor->Preprocess(
+        options, problem_impl, preprocessed_problem.get()));
+
+    auto program = preprocessed_problem->reduced_program.get();
+
     parameters.resize(program->NumParameters());
     program->ParameterBlocksToStateVector(parameters.data());
   }
@@ -76,10 +88,10 @@
     options.linear_solver_type = ITERATIVE_SCHUR;
     options.num_threads = 1;
     options.context = context;
-    options.num_eliminate_blocks = 0;
+    options.num_eliminate_blocks = bal_problem->num_points();
 
     std::string error;
-    auto program = problem_impl->mutable_program();
+    auto program = preprocessed_problem->reduced_program.get();
     auto evaluator = Evaluator::Create(options, program, &error);
     CHECK(evaluator != nullptr);
 
@@ -134,19 +146,39 @@
     return crs_jacobian.get();
   }
 
+  const PartitionedView* PartitionedMatrixViewJacobian(ContextImpl* context) {
+    if (!partitioned_view_jacobian) {
+      auto block_sparse = BlockSparseJacobian(context);
+      partitioned_view_jacobian = std::make_unique<PartitionedView>(
+          *block_sparse, bal_problem->num_points());
+    }
+    return partitioned_view_jacobian.get();
+  }
+
+  const PartitionedView* PartitionedMatrixViewJacobianWithTranspose(
+      ContextImpl* context) {
+    if (!partitioned_view_jacobian_with_transpose) {
+      auto block_sparse_transpose = BlockSparseJacobianWithTranspose(context);
+      partitioned_view_jacobian_with_transpose =
+          std::make_unique<PartitionedView>(*block_sparse_transpose,
+                                            bal_problem->num_points());
+    }
+    return partitioned_view_jacobian_with_transpose.get();
+  }
+
   Vector parameters;
   std::unique_ptr<BundleAdjustmentProblem> bal_problem;
+  std::unique_ptr<PreprocessedProblem> preprocessed_problem;
   std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
   std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_with_transpose;
   std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
+  std::unique_ptr<PartitionedView> partitioned_view_jacobian;
+  std::unique_ptr<PartitionedView> partitioned_view_jacobian_with_transpose;
 };
 
 static void Residuals(benchmark::State& state,
                       BALData* data,
                       ContextImpl* context) {
-  auto problem = data->bal_problem->mutable_problem();
-  auto problem_impl = problem->mutable_impl();
-  CHECK(problem_impl != nullptr);
   const int num_threads = state.range(0);
 
   Evaluator::Options options;
@@ -156,7 +188,9 @@
   options.num_eliminate_blocks = 0;
 
   std::string error;
-  auto program = problem_impl->mutable_program();
+  CHECK(data->preprocessed_problem != nullptr);
+  auto program = data->preprocessed_problem->reduced_program.get();
+  CHECK(program != nullptr);
   auto evaluator = Evaluator::Create(options, program, &error);
   CHECK(evaluator != nullptr);
 
@@ -177,9 +211,6 @@
 static void ResidualsAndJacobian(benchmark::State& state,
                                  BALData* data,
                                  ContextImpl* context) {
-  auto problem = data->bal_problem->mutable_problem();
-  auto problem_impl = problem->mutable_impl();
-  CHECK(problem_impl != nullptr);
   const int num_threads = state.range(0);
 
   Evaluator::Options options;
@@ -189,7 +220,9 @@
   options.num_eliminate_blocks = 0;
 
   std::string error;
-  auto program = problem_impl->mutable_program();
+  CHECK(data->preprocessed_problem != nullptr);
+  auto program = data->preprocessed_problem->reduced_program.get();
+  CHECK(program != nullptr);
   auto evaluator = Evaluator::Create(options, program, &error);
   CHECK(evaluator != nullptr);
 
@@ -208,6 +241,107 @@
   }
 }
 
+static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
+                                           BALData* data,
+                                           ContextImpl* context) {
+  const int num_threads = state.range(0);
+
+  auto jacobian = data->PartitionedMatrixViewJacobian(context);
+
+  Vector y = Vector::Zero(jacobian->num_rows());
+  Vector x = Vector::Random(jacobian->num_cols_f());
+
+  for (auto _ : state) {
+    jacobian->RightMultiplyAndAccumulateF(
+        x.data(), y.data(), context, num_threads);
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateF(benchmark::State& state,
+                                          BALData* data,
+                                          ContextImpl* context) {
+  const int num_threads = state.range(0);
+
+  auto jacobian = data->PartitionedMatrixViewJacobian(context);
+
+  Vector y = Vector::Zero(jacobian->num_cols_f());
+  Vector x = Vector::Random(jacobian->num_rows());
+
+  for (auto _ : state) {
+    jacobian->LeftMultiplyAndAccumulateF(
+        x.data(), y.data(), context, num_threads);
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateWithTransposeF(benchmark::State& state,
+                                                       BALData* data,
+                                                       ContextImpl* context) {
+  const int num_threads = state.range(0);
+
+  auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(context);
+
+  Vector y = Vector::Zero(jacobian->num_cols_f());
+  Vector x = Vector::Random(jacobian->num_rows());
+
+  for (auto _ : state) {
+    jacobian->LeftMultiplyAndAccumulateF(
+        x.data(), y.data(), context, num_threads);
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+static void PMVRightMultiplyAndAccumulateE(benchmark::State& state,
+                                           BALData* data,
+                                           ContextImpl* context) {
+  const int num_threads = state.range(0);
+
+  auto jacobian = data->PartitionedMatrixViewJacobian(context);
+
+  Vector y = Vector::Zero(jacobian->num_rows());
+  Vector x = Vector::Random(jacobian->num_cols_e());
+
+  for (auto _ : state) {
+    jacobian->RightMultiplyAndAccumulateE(
+        x.data(), y.data(), context, num_threads);
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateE(benchmark::State& state,
+                                          BALData* data,
+                                          ContextImpl* context) {
+  const int num_threads = state.range(0);
+
+  auto jacobian = data->PartitionedMatrixViewJacobian(context);
+
+  Vector y = Vector::Zero(jacobian->num_cols_e());
+  Vector x = Vector::Random(jacobian->num_rows());
+
+  for (auto _ : state) {
+    jacobian->LeftMultiplyAndAccumulateE(
+        x.data(), y.data(), context, num_threads);
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateWithTransposeE(benchmark::State& state,
+                                                       BALData* data,
+                                                       ContextImpl* context) {
+  const int num_threads = state.range(0);
+
+  auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(context);
+
+  Vector y = Vector::Zero(jacobian->num_cols_e());
+  Vector x = Vector::Random(jacobian->num_rows());
+
+  for (auto _ : state) {
+    jacobian->LeftMultiplyAndAccumulateE(
+        x.data(), y.data(), context, num_threads);
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+
 static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
                                                BALData* data,
                                                ContextImpl* context) {
@@ -349,6 +483,7 @@
         ->Arg(4)
         ->Arg(8)
         ->Arg(16);
+
     const std::string name_jacobians = "ResidualsAndJacobian<" + path + ">";
     ::benchmark::RegisterBenchmark(name_jacobians.c_str(),
                                    ceres::internal::ResidualsAndJacobian,
@@ -359,6 +494,7 @@
         ->Arg(4)
         ->Arg(8)
         ->Arg(16);
+
     const std::string name_right_product =
         "JacobianRightMultiplyAndAccumulate<" + path + ">";
     ::benchmark::RegisterBenchmark(
@@ -372,6 +508,32 @@
         ->Arg(8)
         ->Arg(16);
 
+    const std::string name_right_product_partitioned_f =
+        "PMVRightMultiplyAndAccumulateF<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_right_product_partitioned_f.c_str(),
+        ceres::internal::PMVRightMultiplyAndAccumulateF,
+        data,
+        &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
+
+    const std::string name_right_product_partitioned_e =
+        "PMVRightMultiplyAndAccumulateE<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_right_product_partitioned_e.c_str(),
+        ceres::internal::PMVRightMultiplyAndAccumulateE,
+        data,
+        &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
+
 #ifndef CERES_NO_CUDA
     const std::string name_right_product_cuda =
         "JacobianRightMultiplyAndAccumulateCuda<" + path + ">";
@@ -403,8 +565,50 @@
         ->Arg(2)
         ->Arg(4)
         ->Arg(8)
-        ->Arg(16)
-        ->Arg(28);
+        ->Arg(16);
+
+    const std::string name_left_product_partitioned_f =
+        "PMVLeftMultiplyAndAccumulateF<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_left_product_partitioned_f.c_str(),
+        ceres::internal::PMVLeftMultiplyAndAccumulateF,
+        data,
+        &context)
+        ->Arg(1);
+    const std::string name_left_product_partitioned_transpose_f =
+        "PMVLeftMultiplyAndAccumulateWithTransposeF<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_left_product_partitioned_transpose_f.c_str(),
+        ceres::internal::PMVLeftMultiplyAndAccumulateWithTransposeF,
+        data,
+        &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
+
+    const std::string name_left_product_partitioned_e =
+        "PMVLeftMultiplyAndAccumulateE<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_left_product_partitioned_e.c_str(),
+        ceres::internal::PMVLeftMultiplyAndAccumulateE,
+        data,
+        &context)
+        ->Arg(1);
+
+    const std::string name_left_product_partitioned_transpose_e =
+        "PMVLeftMultiplyAndAccumulateWithTransposeE<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_left_product_partitioned_transpose_e.c_str(),
+        ceres::internal::PMVLeftMultiplyAndAccumulateWithTransposeE,
+        data,
+        &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
 
 #ifndef CERES_NO_CUDA
     const std::string name_left_product_cuda =
diff --git a/internal/ceres/implicit_schur_complement.cc b/internal/ceres/implicit_schur_complement.cc
index bbd5e4b..6249c8c 100644
--- a/internal/ceres/implicit_schur_complement.cc
+++ b/internal/ceres/implicit_schur_complement.cc
@@ -110,7 +110,10 @@
 
   // y2 = E' y1
   tmp_e_cols_.setZero();
-  A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
+  A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(),
+                                 tmp_e_cols_.data(),
+                                 options_.context,
+                                 options_.num_threads);
 
   // y3 = -(E'E)^-1 y2
   tmp_e_cols_2_.setZero();
@@ -137,7 +140,8 @@
   }
 
   // y = y5 + F' y1
-  A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), y);
+  A_->LeftMultiplyAndAccumulateF(
+      tmp_rows_.data(), y, options_.context, options_.num_threads);
 }
 
 void ImplicitSchurComplement::InversePowerSeriesOperatorRightMultiplyAccumulate(
@@ -150,7 +154,10 @@
 
   // y2 = E' y1
   tmp_e_cols_.setZero();
-  A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
+  A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(),
+                                 tmp_e_cols_.data(),
+                                 options_.context,
+                                 options_.num_threads);
 
   // y3 = (E'E)^-1 y2
   tmp_e_cols_2_.setZero();
@@ -167,7 +174,10 @@
 
   // y4 = F' y1
   tmp_f_cols_.setZero();
-  A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), tmp_f_cols_.data());
+  A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(),
+                                 tmp_f_cols_.data(),
+                                 options_.context,
+                                 options_.num_threads);
 
   // y += (F'F)^-1 y4
   block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate(
@@ -217,7 +227,10 @@
 
   // y3 = E' y2
   tmp_e_cols_.setZero();
-  A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
+  A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(),
+                                 tmp_e_cols_.data(),
+                                 options_.context,
+                                 options_.num_threads);
 
   // y = (E'E)^-1 y3
   VectorRef(y, num_cols).setZero();
@@ -241,7 +254,8 @@
 void ImplicitSchurComplement::UpdateRhs() {
   // y1 = E'b
   tmp_e_cols_.setZero();
-  A_->LeftMultiplyAndAccumulateE(b_, tmp_e_cols_.data());
+  A_->LeftMultiplyAndAccumulateE(
+      b_, tmp_e_cols_.data(), options_.context, options_.num_threads);
 
   // y2 = (E'E)^-1 y1
   Vector y2 = Vector::Zero(A_->num_cols_e());
@@ -258,7 +272,8 @@
 
   // rhs = F' y3
   rhs_.setZero();
-  A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), rhs_.data());
+  A_->LeftMultiplyAndAccumulateF(
+      tmp_rows_.data(), rhs_.data(), options_.context, options_.num_threads);
 }
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc
index db4ad2b..589a1d2 100644
--- a/internal/ceres/iterative_schur_complement_solver.cc
+++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -68,6 +68,9 @@
   EventLogger event_logger("IterativeSchurComplementSolver::Solve");
 
   CHECK(A->block_structure() != nullptr);
+  if (options_.num_threads != 1) {
+    A->AddTransposeBlockStructure();
+  }
   const int num_eliminate_blocks = options_.elimination_groups[0];
   // Initialize a ImplicitSchurComplement object.
   if (schur_complement_ == nullptr) {
diff --git a/internal/ceres/linear_least_squares_problems.cc b/internal/ceres/linear_least_squares_problems.cc
index 6cee2f6..4c55fbc 100644
--- a/internal/ceres/linear_least_squares_problems.cc
+++ b/internal/ceres/linear_least_squares_problems.cc
@@ -63,6 +63,8 @@
       return LinearLeastSquaresProblem4();
     case 5:
       return LinearLeastSquaresProblem5();
+    case 6:
+      return LinearLeastSquaresProblem6();
     default:
       LOG(FATAL) << "Unknown problem id requested " << id;
   }
@@ -798,6 +800,133 @@
   return problem;
 }
 
+/*
+      A = [1 2 0 0 0 1 1
+           1 4 0 0 0 5 6
+           3 4 0 0 0 7 8
+           5 6 0 0 0 9 0
+           0 0 9 0 0 3 1]
+
+      b = [0
+           1
+           2
+           3
+           4]
+*/
+// BlockSparseMatrix version
+//
+// This problem has the unique property that it has two different
+// sized f-blocks, but only one of them occurs in the rows involving
+// the one e-block. So performing Schur elimination on this problem
+// tests the Schur Eliminator's ability to handle non-e-block rows
+// correctly when their structure does not conform to the static
+// structure determined by DetectStructure.
+//
+// Additionally, this problem has the first row of the last row block of E being
+// larger than number of row blocks in E
+//
+// NOTE: This problem is too small and rank deficient to be solved without
+// the diagonal regularization.
+std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem6() {
+  int num_rows = 5;
+  int num_cols = 7;
+
+  auto problem = std::make_unique<LinearLeastSquaresProblem>();
+
+  problem->b = std::make_unique<double[]>(num_rows);
+  problem->D = std::make_unique<double[]>(num_cols);
+  problem->num_eliminate_blocks = 1;
+
+  auto* bs = new CompressedRowBlockStructure;
+  auto values = std::make_unique<double[]>(num_rows * num_cols);
+
+  // Column block structure
+  bs->cols.emplace_back();
+  bs->cols.back().size = 2;
+  bs->cols.back().position = 0;
+
+  bs->cols.emplace_back();
+  bs->cols.back().size = 3;
+  bs->cols.back().position = 2;
+
+  bs->cols.emplace_back();
+  bs->cols.back().size = 2;
+  bs->cols.back().position = 5;
+
+  int nnz = 0;
+
+  // Row 1 & 2
+  {
+    bs->rows.emplace_back();
+    CompressedRow& row = bs->rows.back();
+    row.block.size = 2;
+    row.block.position = 0;
+
+    row.cells.emplace_back(0, nnz);
+    values[nnz++] = 1;
+    values[nnz++] = 2;
+    values[nnz++] = 1;
+    values[nnz++] = 4;
+
+    row.cells.emplace_back(2, nnz);
+    values[nnz++] = 1;
+    values[nnz++] = 1;
+    values[nnz++] = 5;
+    values[nnz++] = 6;
+  }
+
+  // Row 3 & 4
+  {
+    bs->rows.emplace_back();
+    CompressedRow& row = bs->rows.back();
+    row.block.size = 2;
+    row.block.position = 2;
+
+    row.cells.emplace_back(0, nnz);
+    values[nnz++] = 3;
+    values[nnz++] = 4;
+    values[nnz++] = 5;
+    values[nnz++] = 6;
+
+    row.cells.emplace_back(2, nnz);
+    values[nnz++] = 7;
+    values[nnz++] = 8;
+    values[nnz++] = 9;
+    values[nnz++] = 0;
+  }
+
+  // Row 5
+  {
+    bs->rows.emplace_back();
+    CompressedRow& row = bs->rows.back();
+    row.block.size = 1;
+    row.block.position = 4;
+
+    row.cells.emplace_back(1, nnz);
+    values[nnz++] = 9;
+    values[nnz++] = 0;
+    values[nnz++] = 0;
+
+    row.cells.emplace_back(2, nnz);
+    values[nnz++] = 3;
+    values[nnz++] = 1;
+  }
+
+  auto A = std::make_unique<BlockSparseMatrix>(bs);
+  memcpy(A->mutable_values(), values.get(), nnz * sizeof(*A->values()));
+
+  for (int i = 0; i < num_cols; ++i) {
+    problem->D.get()[i] = (i + 1) * 100;
+  }
+
+  for (int i = 0; i < num_rows; ++i) {
+    problem->b.get()[i] = i;
+  }
+
+  problem->A = std::move(A);
+  return problem;
+}
+
 namespace {
 bool DumpLinearLeastSquaresProblemToConsole(const SparseMatrix* A,
                                             const double* D,
diff --git a/internal/ceres/linear_least_squares_problems.h b/internal/ceres/linear_least_squares_problems.h
index 0e14517..c56a338 100644
--- a/internal/ceres/linear_least_squares_problems.h
+++ b/internal/ceres/linear_least_squares_problems.h
@@ -75,6 +75,8 @@
 std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem4();
 CERES_NO_EXPORT
 std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem5();
+CERES_NO_EXPORT
+std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem6();
 
 // Write the linear least squares problem to disk. The exact format
 // depends on dump_format_type.
diff --git a/internal/ceres/partitioned_matrix_view.h b/internal/ceres/partitioned_matrix_view.h
index 86106da..533a0e5 100644
--- a/internal/ceres/partitioned_matrix_view.h
+++ b/internal/ceres/partitioned_matrix_view.h
@@ -70,9 +70,17 @@
 
   // y += E'x
   virtual void LeftMultiplyAndAccumulateE(const double* x, double* y) const = 0;
+  virtual void LeftMultiplyAndAccumulateE(const double* x,
+                                          double* y,
+                                          ContextImpl* context,
+                                          int num_threads) const = 0;
 
   // y += F'x
   virtual void LeftMultiplyAndAccumulateF(const double* x, double* y) const = 0;
+  virtual void LeftMultiplyAndAccumulateF(const double* x,
+                                          double* y,
+                                          ContextImpl* context,
+                                          int num_threads) const = 0;
 
   // y += Ex
   virtual void RightMultiplyAndAccumulateE(const double* x,
@@ -138,6 +146,14 @@
 
   void LeftMultiplyAndAccumulateE(const double* x, double* y) const final;
   void LeftMultiplyAndAccumulateF(const double* x, double* y) const final;
+  void LeftMultiplyAndAccumulateE(const double* x,
+                                  double* y,
+                                  ContextImpl* context,
+                                  int num_threads) const final;
+  void LeftMultiplyAndAccumulateF(const double* x,
+                                  double* y,
+                                  ContextImpl* context,
+                                  int num_threads) const final;
   void RightMultiplyAndAccumulateE(const double* x, double* y) const final;
   void RightMultiplyAndAccumulateF(const double* x, double* y) const final;
   void RightMultiplyAndAccumulateE(const double* x,
diff --git a/internal/ceres/partitioned_matrix_view_impl.h b/internal/ceres/partitioned_matrix_view_impl.h
index fea0a7d..689f09e 100644
--- a/internal/ceres/partitioned_matrix_view_impl.h
+++ b/internal/ceres/partitioned_matrix_view_impl.h
@@ -188,6 +188,9 @@
 template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
 void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
     LeftMultiplyAndAccumulateE(const double* x, double* y) const {
+  if (!num_col_blocks_e_) return;
+  if (!num_row_blocks_e_) return;
+
   const CompressedRowBlockStructure* bs = matrix_.block_structure();
 
   // Iterate over the first num_row_blocks_e_ row blocks, and multiply
@@ -211,7 +214,54 @@
 
 template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
 void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+    LeftMultiplyAndAccumulateE(const double* x,
+                               double* y,
+                               ContextImpl* context,
+                               int num_threads) const {
+  if (!num_col_blocks_e_) return;
+  if (!num_row_blocks_e_) return;
+
+  auto transpose_bs = matrix_.transpose_block_structure();
+  if (transpose_bs == nullptr || num_threads == 1) {
+    LeftMultiplyAndAccumulateE(x, y);
+    return;
+  }
+
+  // Local copies of class members in order to avoid capturing pointer to the
+  // whole object in lambda function
+  auto values = matrix_.values();
+  const int num_row_blocks_e = num_row_blocks_e_;
+  ParallelFor(
+      context,
+      0,
+      num_col_blocks_e_,
+      num_threads,
+      [values, transpose_bs, num_row_blocks_e, x, y](int row_block_id) {
+        int row_block_pos = transpose_bs->rows[row_block_id].block.position;
+        int row_block_size = transpose_bs->rows[row_block_id].block.size;
+        auto& cells = transpose_bs->rows[row_block_id].cells;
+
+        for (auto& cell : cells) {
+          const int col_block_id = cell.block_id;
+          const int col_block_size = transpose_bs->cols[col_block_id].size;
+          const int col_block_pos = transpose_bs->cols[col_block_id].position;
+          if (col_block_id >= num_row_blocks_e) break;
+          MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
+              values + cell.position,
+              col_block_size,
+              row_block_size,
+              x + col_block_pos,
+              y + row_block_pos);
+        }
+      },
+      transpose_bs->rows.data(),
+      [](const CompressedRow& row) { return row.cumulative_nnz; });
+}
+
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
     LeftMultiplyAndAccumulateF(const double* x, double* y) const {
+  if (!num_col_blocks_f_) return;
   const CompressedRowBlockStructure* bs = matrix_.block_structure();
 
   // Iterate over row blocks, and if the row block is in E, then
@@ -255,6 +305,67 @@
   }
 }
 
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+    LeftMultiplyAndAccumulateF(const double* x,
+                               double* y,
+                               ContextImpl* context,
+                               int num_threads) const {
+  if (!num_col_blocks_f_) return;
+  auto transpose_bs = matrix_.transpose_block_structure();
+  if (transpose_bs == nullptr || num_threads == 1) {
+    LeftMultiplyAndAccumulateF(x, y);
+    return;
+  }
+  // Local copies of class members  in order to avoid capturing pointer to the
+  // whole object in lambda function
+  auto values = matrix_.values();
+  const int num_row_blocks_e = num_row_blocks_e_;
+  const int num_cols_e = num_cols_e_;
+  ParallelFor(
+      context,
+      num_col_blocks_e_,
+      num_col_blocks_e_ + num_col_blocks_f_,
+      num_threads,
+      [values, transpose_bs, num_row_blocks_e, num_cols_e, x, y](
+          int row_block_id) {
+        int row_block_pos = transpose_bs->rows[row_block_id].block.position;
+        int row_block_size = transpose_bs->rows[row_block_id].block.size;
+        auto& cells = transpose_bs->rows[row_block_id].cells;
+
+        const int num_cells = cells.size();
+        int cell_idx = 0;
+        for (; cell_idx < num_cells; ++cell_idx) {
+          auto& cell = cells[cell_idx];
+          const int col_block_id = cell.block_id;
+          const int col_block_size = transpose_bs->cols[col_block_id].size;
+          const int col_block_pos = transpose_bs->cols[col_block_id].position;
+          if (col_block_id >= num_row_blocks_e) break;
+
+          MatrixTransposeVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
+              values + cell.position,
+              col_block_size,
+              row_block_size,
+              x + col_block_pos,
+              y + row_block_pos - num_cols_e);
+        }
+        for (; cell_idx < num_cells; ++cell_idx) {
+          auto& cell = cells[cell_idx];
+          const int col_block_id = cell.block_id;
+          const int col_block_size = transpose_bs->cols[col_block_id].size;
+          const int col_block_pos = transpose_bs->cols[col_block_id].position;
+          MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+              values + cell.position,
+              col_block_size,
+              row_block_size,
+              x + col_block_pos,
+              y + row_block_pos - num_cols_e);
+        }
+      },
+      transpose_bs->rows.data(),
+      [](const CompressedRow& row) { return row.cumulative_nnz; });
+}
+
 // Given a range of columns blocks of a matrix m, compute the block
 // structure of the block diagonal of the matrix m(:,
 // start_col_block:end_col_block)'m(:, start_col_block:end_col_block)
diff --git a/internal/ceres/partitioned_matrix_view_test.cc b/internal/ceres/partitioned_matrix_view_test.cc
index b11ea8e..4653e06 100644
--- a/internal/ceres/partitioned_matrix_view_test.cc
+++ b/internal/ceres/partitioned_matrix_view_test.cc
@@ -177,30 +177,40 @@
 const int kMaxNumThreads = 8;
 class PartitionedMatrixViewParallelTest : public ::testing::TestWithParam<int> {
  protected:
+  static const int kNumProblems = 3;
   void SetUp() final {
-    std::unique_ptr<LinearLeastSquaresProblem> problem =
-        CreateLinearLeastSquaresProblemFromId(2);
-    CHECK(problem != nullptr);
-    A_ = std::move(problem->A);
+    int problem_ids[kNumProblems] = {2, 4, 6};
+    for (int i = 0; i < kNumProblems; ++i) {
+      auto problem = CreateLinearLeastSquaresProblemFromId(problem_ids[i]);
+      CHECK(problem != nullptr);
+      SetUpMatrix(i, problem.get());
+    }
 
-    num_cols_ = A_->num_cols();
-    num_rows_ = A_->num_rows();
-    num_eliminate_blocks_ = problem->num_eliminate_blocks;
-    LinearSolver::Options options;
-    options.elimination_groups.push_back(num_eliminate_blocks_);
-    pmv_ = PartitionedMatrixViewBase::Create(
-        options, *down_cast<BlockSparseMatrix*>(A_.get()));
     context_.EnsureMinimumThreads(kMaxNumThreads);
   }
 
+  void SetUpMatrix(int id, LinearLeastSquaresProblem* problem) {
+    A_[id] = std::move(problem->A);
+    auto& A = A_[id];
+    auto block_sparse = down_cast<BlockSparseMatrix*>(A.get());
+    block_sparse->AddTransposeBlockStructure();
+
+    num_cols_[id] = A->num_cols();
+    num_rows_[id] = A->num_rows();
+    num_eliminate_blocks_[id] = problem->num_eliminate_blocks;
+    LinearSolver::Options options;
+    options.elimination_groups.push_back(num_eliminate_blocks_[id]);
+    pmv_[id] = PartitionedMatrixViewBase::Create(options, *block_sparse);
+  }
+
   double RandDouble() { return distribution_(prng_); }
 
   ContextImpl context_;
-  int num_rows_;
-  int num_cols_;
-  int num_eliminate_blocks_;
-  std::unique_ptr<SparseMatrix> A_;
-  std::unique_ptr<PartitionedMatrixViewBase> pmv_;
+  int num_rows_[kNumProblems];
+  int num_cols_[kNumProblems];
+  int num_eliminate_blocks_[kNumProblems];
+  std::unique_ptr<SparseMatrix> A_[kNumProblems];
+  std::unique_ptr<PartitionedMatrixViewBase> pmv_[kNumProblems];
   std::mt19937 prng_;
   std::uniform_real_distribution<double> distribution_ =
       std::uniform_real_distribution<double>(0.0, 1.0);
@@ -208,23 +218,83 @@
 
 TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateEParallel) {
   const int kNumThreads = GetParam();
-  Vector x1(pmv_->num_cols_e());
-  Vector x2(pmv_->num_cols());
-  x2.setZero();
+  for (int p = 0; p < kNumProblems; ++p) {
+    auto& pmv = pmv_[p];
+    auto& A = A_[p];
 
-  for (int i = 0; i < pmv_->num_cols_e(); ++i) {
-    x1(i) = x2(i) = RandDouble();
+    Vector x1(pmv->num_cols_e());
+    Vector x2(pmv->num_cols());
+    x2.setZero();
+
+    for (int i = 0; i < pmv->num_cols_e(); ++i) {
+      x1(i) = x2(i) = RandDouble();
+    }
+
+    Vector y1 = Vector::Zero(pmv->num_rows());
+    pmv->RightMultiplyAndAccumulateE(
+        x1.data(), y1.data(), &context_, kNumThreads);
+
+    Vector y2 = Vector::Zero(pmv->num_rows());
+    A->RightMultiplyAndAccumulate(x2.data(), y2.data());
+
+    for (int i = 0; i < pmv->num_rows(); ++i) {
+      EXPECT_NEAR(y1(i), y2(i), kEpsilon);
+    }
   }
+}
 
-  Vector y1 = Vector::Zero(pmv_->num_rows());
-  pmv_->RightMultiplyAndAccumulateE(
-      x1.data(), y1.data(), &context_, kNumThreads);
+TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateFParallel) {
+  const int kNumThreads = GetParam();
+  for (int p = 0; p < kNumProblems; ++p) {
+    auto& pmv = pmv_[p];
+    auto& A = A_[p];
+    Vector x1(pmv->num_cols_f());
+    Vector x2(pmv->num_cols());
+    x2.setZero();
 
-  Vector y2 = Vector::Zero(pmv_->num_rows());
-  A_->RightMultiplyAndAccumulate(x2.data(), y2.data());
+    for (int i = 0; i < pmv->num_cols_f(); ++i) {
+      x1(i) = x2(i + pmv->num_cols_e()) = RandDouble();
+    }
 
-  for (int i = 0; i < pmv_->num_rows(); ++i) {
-    EXPECT_NEAR(y1(i), y2(i), kEpsilon);
+    Vector y1 = Vector::Zero(pmv->num_rows());
+    pmv->RightMultiplyAndAccumulateF(
+        x1.data(), y1.data(), &context_, kNumThreads);
+
+    Vector y2 = Vector::Zero(pmv->num_rows());
+    A->RightMultiplyAndAccumulate(x2.data(), y2.data());
+
+    for (int i = 0; i < pmv->num_rows(); ++i) {
+      EXPECT_NEAR(y1(i), y2(i), kEpsilon);
+    }
+  }
+}
+
+TEST_P(PartitionedMatrixViewParallelTest, LeftMultiplyAndAccumulateParallel) {
+  const int kNumThreads = GetParam();
+  for (int p = 0; p < kNumProblems; ++p) {
+    auto& pmv = pmv_[p];
+    auto& A = A_[p];
+    Vector x = Vector::Zero(pmv->num_rows());
+    for (int i = 0; i < pmv->num_rows(); ++i) {
+      x(i) = RandDouble();
+    }
+    Vector x_pre = x;
+
+    Vector y = Vector::Zero(pmv->num_cols());
+    Vector y1 = Vector::Zero(pmv->num_cols_e());
+    Vector y2 = Vector::Zero(pmv->num_cols_f());
+
+    A->LeftMultiplyAndAccumulate(x.data(), y.data());
+    pmv->LeftMultiplyAndAccumulateE(
+        x.data(), y1.data(), &context_, kNumThreads);
+    pmv->LeftMultiplyAndAccumulateF(
+        x.data(), y2.data(), &context_, kNumThreads);
+
+    for (int i = 0; i < pmv->num_cols(); ++i) {
+      EXPECT_NEAR(y(i),
+                  (i < pmv->num_cols_e()) ? y1(i) : y2(i - pmv->num_cols_e()),
+                  kEpsilon);
+    }
   }
 }