Parallel updates to block-diagonal EtE FtF

--------------------------------------------------------------------------------
Benchmark                                                                   Time
--------------------------------------------------------------------------------
PMVUpdateBlockDiagonalFtF<final/problem-13682-4456117-pre.txt>/1   5056275941 ns
PMVUpdateBlockDiagonalFtF<final/problem-13682-4456117-pre.txt>/2   3677677097 ns
PMVUpdateBlockDiagonalFtF<final/problem-13682-4456117-pre.txt>/4   1932236015 ns
PMVUpdateBlockDiagonalFtF<final/problem-13682-4456117-pre.txt>/8    984585015 ns
PMVUpdateBlockDiagonalFtF<final/problem-13682-4456117-pre.txt>/16   614918752 ns

PMVUpdateBlockDiagonalEtE<final/problem-13682-4456117-pre.txt>/1    449324491 ns
PMVUpdateBlockDiagonalEtE<final/problem-13682-4456117-pre.txt>/2    273147462 ns
PMVUpdateBlockDiagonalEtE<final/problem-13682-4456117-pre.txt>/4    150742698 ns
PMVUpdateBlockDiagonalEtE<final/problem-13682-4456117-pre.txt>/8     81602564 ns
PMVUpdateBlockDiagonalEtE<final/problem-13682-4456117-pre.txt>/16    47010769 ns

PMVUpdateBlockDiagonalFtF<venice/problem-1778-993923-pre.txt>/1     774598200 ns
PMVUpdateBlockDiagonalFtF<venice/problem-1778-993923-pre.txt>/2     611312877 ns
PMVUpdateBlockDiagonalFtF<venice/problem-1778-993923-pre.txt>/4     326701149 ns
PMVUpdateBlockDiagonalFtF<venice/problem-1778-993923-pre.txt>/8     165634457 ns
PMVUpdateBlockDiagonalFtF<venice/problem-1778-993923-pre.txt>/16     90631068 ns

PMVUpdateBlockDiagonalEtE<venice/problem-1778-993923-pre.txt>/1      80651817 ns
PMVUpdateBlockDiagonalEtE<venice/problem-1778-993923-pre.txt>/2      49688691 ns
PMVUpdateBlockDiagonalEtE<venice/problem-1778-993923-pre.txt>/4      27199153 ns
PMVUpdateBlockDiagonalEtE<venice/problem-1778-993923-pre.txt>/8      14301768 ns
PMVUpdateBlockDiagonalEtE<venice/problem-1778-993923-pre.txt>/16      8683479 ns

PMVUpdateBlockDiagonalFtF<ladybug/problem-1723-156502-pre.txt>/1    104422529 ns
PMVUpdateBlockDiagonalFtF<ladybug/problem-1723-156502-pre.txt>/2     81555176 ns
PMVUpdateBlockDiagonalFtF<ladybug/problem-1723-156502-pre.txt>/4     43227593 ns
PMVUpdateBlockDiagonalFtF<ladybug/problem-1723-156502-pre.txt>/8     22177895 ns
PMVUpdateBlockDiagonalFtF<ladybug/problem-1723-156502-pre.txt>/16    12505813 ns

PMVUpdateBlockDiagonalEtE<ladybug/problem-1723-156502-pre.txt>/1     14205253 ns
PMVUpdateBlockDiagonalEtE<ladybug/problem-1723-156502-pre.txt>/2      7102357 ns
PMVUpdateBlockDiagonalEtE<ladybug/problem-1723-156502-pre.txt>/4      3806598 ns
PMVUpdateBlockDiagonalEtE<ladybug/problem-1723-156502-pre.txt>/8      2112615 ns
PMVUpdateBlockDiagonalEtE<ladybug/problem-1723-156502-pre.txt>/16     1245771 ns

PMVUpdateBlockDiagonalFtF<dubrovnik/problem-356-226730-pre.txt>/1   190102870 ns
PMVUpdateBlockDiagonalFtF<dubrovnik/problem-356-226730-pre.txt>/2   157359897 ns
PMVUpdateBlockDiagonalFtF<dubrovnik/problem-356-226730-pre.txt>/4    82657662 ns
PMVUpdateBlockDiagonalFtF<dubrovnik/problem-356-226730-pre.txt>/8    42746490 ns
PMVUpdateBlockDiagonalFtF<dubrovnik/problem-356-226730-pre.txt>/16   23434967 ns

PMVUpdateBlockDiagonalEtE<dubrovnik/problem-356-226730-pre.txt>/1    18894355 ns
PMVUpdateBlockDiagonalEtE<dubrovnik/problem-356-226730-pre.txt>/2    12138228 ns
PMVUpdateBlockDiagonalEtE<dubrovnik/problem-356-226730-pre.txt>/4     6808771 ns
PMVUpdateBlockDiagonalEtE<dubrovnik/problem-356-226730-pre.txt>/8     3829718 ns
PMVUpdateBlockDiagonalEtE<dubrovnik/problem-356-226730-pre.txt>/16    2103688 ns

PMVUpdateBlockDiagonalFtF<trafalgar/problem-257-65132-pre.txt>/1     34230036 ns
PMVUpdateBlockDiagonalFtF<trafalgar/problem-257-65132-pre.txt>/2     20121184 ns
PMVUpdateBlockDiagonalFtF<trafalgar/problem-257-65132-pre.txt>/4     10899938 ns
PMVUpdateBlockDiagonalFtF<trafalgar/problem-257-65132-pre.txt>/8      6186359 ns
PMVUpdateBlockDiagonalFtF<trafalgar/problem-257-65132-pre.txt>/16     4207103 ns

PMVUpdateBlockDiagonalEtE<trafalgar/problem-257-65132-pre.txt>/1      3077110 ns
PMVUpdateBlockDiagonalEtE<trafalgar/problem-257-65132-pre.txt>/2      2224104 ns
PMVUpdateBlockDiagonalEtE<trafalgar/problem-257-65132-pre.txt>/4      1274841 ns
PMVUpdateBlockDiagonalEtE<trafalgar/problem-257-65132-pre.txt>/8       721140 ns
PMVUpdateBlockDiagonalEtE<trafalgar/problem-257-65132-pre.txt>/16      437715 ns

Change-Id: If5a342a063869bd9c0505bf96b6d957da5169c1d
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 9849b5a..9717d0f 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -147,19 +147,28 @@
 
   const PartitionedView* PartitionedMatrixViewJacobian(
       const LinearSolver::Options& options) {
-    auto block_sparse = BlockSparseJacobian(options.context);
+    auto block_sparse = BlockSparseJacobianWithTranspose(options.context);
     partitioned_view_jacobian =
         std::make_unique<PartitionedView>(options, *block_sparse);
     return partitioned_view_jacobian.get();
   }
 
-  const PartitionedView* PartitionedMatrixViewJacobianWithTranspose(
-      const LinearSolver::Options& options) {
-    auto block_sparse_transpose =
-        BlockSparseJacobianWithTranspose(options.context);
-    partitioned_view_jacobian_with_transpose =
-        std::make_unique<PartitionedView>(options, *block_sparse_transpose);
-    return partitioned_view_jacobian_with_transpose.get();
+  BlockSparseMatrix* BlockDiagonalEtE(const LinearSolver::Options& options) {
+    if (!block_diagonal_ete) {
+      auto partitioned_view =
+          PartitionedMatrixViewJacobian(options);
+      block_diagonal_ete = partitioned_view->CreateBlockDiagonalEtE();
+    }
+    return block_diagonal_ete.get();
+  }
+
+  BlockSparseMatrix* BlockDiagonalFtF(const LinearSolver::Options& options) {
+    if (!block_diagonal_ftf) {
+      auto partitioned_view =
+          PartitionedMatrixViewJacobian(options);
+      block_diagonal_ftf = partitioned_view->CreateBlockDiagonalFtF();
+    }
+    return block_diagonal_ftf.get();
   }
 
   Vector parameters;
@@ -169,7 +178,8 @@
   std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_with_transpose;
   std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
   std::unique_ptr<PartitionedView> partitioned_view_jacobian;
-  std::unique_ptr<PartitionedView> partitioned_view_jacobian_with_transpose;
+  std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;
+  std::unique_ptr<BlockSparseMatrix> block_diagonal_ftf;
 };
 
 static void Residuals(benchmark::State& state,
@@ -262,7 +272,7 @@
   options.num_threads = state.range(0);
   options.elimination_groups.push_back(data->bal_problem->num_points());
   options.context = context;
-  auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(options);
+  auto jacobian = data->PartitionedMatrixViewJacobian(options);
 
   Vector y = Vector::Zero(jacobian->num_cols_f());
   Vector x = Vector::Random(jacobian->num_rows());
@@ -298,7 +308,7 @@
   options.num_threads = state.range(0);
   options.elimination_groups.push_back(data->bal_problem->num_points());
   options.context = context;
-  auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(options);
+  auto jacobian = data->PartitionedMatrixViewJacobian(options);
 
   Vector y = Vector::Zero(jacobian->num_cols_e());
   Vector x = Vector::Random(jacobian->num_rows());
@@ -309,6 +319,36 @@
   CHECK_GT(y.squaredNorm(), 0.);
 }
 
+static void PMVUpdateBlockDiagonalEtE(benchmark::State& state,
+                                      BALData* data,
+                                      ContextImpl* context) {
+  LinearSolver::Options options;
+  options.num_threads = state.range(0);
+  options.elimination_groups.push_back(data->bal_problem->num_points());
+  options.context = context;
+  auto jacobian = data->PartitionedMatrixViewJacobian(options);
+  auto block_diagonal_ete = data->BlockDiagonalEtE(options);
+
+  for (auto _ : state) {
+    jacobian->UpdateBlockDiagonalEtE(block_diagonal_ete);
+  }
+}
+
+static void PMVUpdateBlockDiagonalFtF(benchmark::State& state,
+                                      BALData* data,
+                                      ContextImpl* context) {
+  LinearSolver::Options options;
+  options.num_threads = state.range(0);
+  options.elimination_groups.push_back(data->bal_problem->num_points());
+  options.context = context;
+  auto jacobian = data->PartitionedMatrixViewJacobian(options);
+  auto block_diagonal_ftf = data->BlockDiagonalFtF(options);
+
+  for (auto _ : state) {
+    jacobian->UpdateBlockDiagonalFtF(block_diagonal_ftf);
+  }
+}
+
 static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
                                                BALData* data,
                                                ContextImpl* context) {
@@ -485,6 +525,30 @@
         ->Arg(8)
         ->Arg(16);
 
+    const std::string name_update_block_diagonal_ftf =
+        "PMVUpdateBlockDiagonalFtF<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ftf.c_str(),
+                                   ceres::internal::PMVUpdateBlockDiagonalFtF,
+                                   data,
+                                   &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
+
+    const std::string name_update_block_diagonal_ete =
+        "PMVUpdateBlockDiagonalEtE<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ete.c_str(),
+                                   ceres::internal::PMVUpdateBlockDiagonalEtE,
+                                   data,
+                                   &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
+
 #ifndef CERES_NO_CUDA
     const std::string name_right_product_cuda =
         "JacobianRightMultiplyAndAccumulateCuda<" + path + ">";
diff --git a/internal/ceres/partitioned_matrix_view.h b/internal/ceres/partitioned_matrix_view.h
index f34fd4a..380b576 100644
--- a/internal/ceres/partitioned_matrix_view.h
+++ b/internal/ceres/partitioned_matrix_view.h
@@ -166,7 +166,15 @@
   std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const final;
   std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalFtF() const final;
   void UpdateBlockDiagonalEtE(BlockSparseMatrix* block_diagonal) const final;
+  void UpdateBlockDiagonalEtESingleThreaded(
+      BlockSparseMatrix* block_diagonal) const;
+  void UpdateBlockDiagonalEtEMultiThreaded(
+      BlockSparseMatrix* block_diagonal) const;
   void UpdateBlockDiagonalFtF(BlockSparseMatrix* block_diagonal) const final;
+  void UpdateBlockDiagonalFtFSingleThreaded(
+      BlockSparseMatrix* block_diagonal) const;
+  void UpdateBlockDiagonalFtFMultiThreaded(
+      BlockSparseMatrix* block_diagonal) const;
   // clang-format off
   int num_col_blocks_e() const final { return num_col_blocks_e_;  }
   int num_col_blocks_f() const final { return num_col_blocks_f_;  }
diff --git a/internal/ceres/partitioned_matrix_view_impl.h b/internal/ceres/partitioned_matrix_view_impl.h
index 1ca2e14..e685f5b 100644
--- a/internal/ceres/partitioned_matrix_view_impl.h
+++ b/internal/ceres/partitioned_matrix_view_impl.h
@@ -444,10 +444,10 @@
 //
 template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
 void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
-    UpdateBlockDiagonalEtE(BlockSparseMatrix* block_diagonal) const {
-  const CompressedRowBlockStructure* bs = matrix_.block_structure();
-  const CompressedRowBlockStructure* block_diagonal_structure =
-      block_diagonal->block_structure();
+    UpdateBlockDiagonalEtESingleThreaded(
+        BlockSparseMatrix* block_diagonal) const {
+  auto bs = matrix_.block_structure();
+  auto block_diagonal_structure = block_diagonal->block_structure();
 
   block_diagonal->SetZero();
   const double* values = matrix_.values();
@@ -470,6 +470,57 @@
   }
 }
 
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+    UpdateBlockDiagonalEtEMultiThreaded(
+        BlockSparseMatrix* block_diagonal) const {
+  auto transpose_block_structure = matrix_.transpose_block_structure();
+  CHECK(transpose_block_structure != nullptr);
+  auto block_diagonal_structure = block_diagonal->block_structure();
+
+  const double* values = matrix_.values();
+  double* values_diagonal = block_diagonal->mutable_values();
+  ParallelFor(
+      options_.context,
+      0,
+      num_col_blocks_e_,
+      options_.num_threads,
+      [values,
+       transpose_block_structure,
+       values_diagonal,
+       block_diagonal_structure](int col_block_id) {
+        int cell_position =
+            block_diagonal_structure->rows[col_block_id].cells[0].position;
+        double* cell_values = values_diagonal + cell_position;
+        int col_block_size =
+            transpose_block_structure->rows[col_block_id].block.size;
+        auto& cells = transpose_block_structure->rows[col_block_id].cells;
+        MatrixRef(cell_values, col_block_size, col_block_size).setZero();
+
+        for (auto& c : cells) {
+          int row_block_size = transpose_block_structure->cols[c.block_id].size;
+          // clang-format off
+          MatrixTransposeMatrixMultiply<kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
+            values + c.position, row_block_size, col_block_size,
+            values + c.position, row_block_size, col_block_size,
+            cell_values, 0, 0, col_block_size, col_block_size);
+          // clang-format on
+        }
+      },
+      e_cols_partition_);
+}
+
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+    UpdateBlockDiagonalEtE(BlockSparseMatrix* block_diagonal) const {
+  if (options_.num_threads == 1) {
+    UpdateBlockDiagonalEtESingleThreaded(block_diagonal);
+  } else {
+    CHECK(options_.context != nullptr);
+    UpdateBlockDiagonalEtEMultiThreaded(block_diagonal);
+  }
+}
+
 // Similar to the code in RightMultiplyAndAccumulateF, except instead of the
 // matrix vector multiply its an outer product.
 //
@@ -477,10 +528,10 @@
 //
 template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
 void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
-    UpdateBlockDiagonalFtF(BlockSparseMatrix* block_diagonal) const {
-  const CompressedRowBlockStructure* bs = matrix_.block_structure();
-  const CompressedRowBlockStructure* block_diagonal_structure =
-      block_diagonal->block_structure();
+    UpdateBlockDiagonalFtFSingleThreaded(
+        BlockSparseMatrix* block_diagonal) const {
+  auto bs = matrix_.block_structure();
+  auto block_diagonal_structure = block_diagonal->block_structure();
 
   block_diagonal->SetZero();
   const double* values = matrix_.values();
@@ -527,4 +578,82 @@
   }
 }
 
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+    UpdateBlockDiagonalFtFMultiThreaded(
+        BlockSparseMatrix* block_diagonal) const {
+  auto transpose_block_structure = matrix_.transpose_block_structure();
+  CHECK(transpose_block_structure != nullptr);
+  auto block_diagonal_structure = block_diagonal->block_structure();
+
+  const double* values = matrix_.values();
+  double* values_diagonal = block_diagonal->mutable_values();
+
+  const int num_col_blocks_e = num_col_blocks_e_;
+  const int num_row_blocks_e = num_row_blocks_e_;
+  ParallelFor(
+      options_.context,
+      num_col_blocks_e_,
+      num_col_blocks_e + num_col_blocks_f_,
+      options_.num_threads,
+      [transpose_block_structure,
+       block_diagonal_structure,
+       num_col_blocks_e,
+       num_row_blocks_e,
+       values,
+       values_diagonal](int col_block_id) {
+        const int col_block_size =
+            transpose_block_structure->rows[col_block_id].block.size;
+        const int diagonal_block_id = col_block_id - num_col_blocks_e;
+        const int cell_position =
+            block_diagonal_structure->rows[diagonal_block_id].cells[0].position;
+        double* cell_values = values_diagonal + cell_position;
+
+        MatrixRef(cell_values, col_block_size, col_block_size).setZero();
+
+        auto& cells = transpose_block_structure->rows[col_block_id].cells;
+        const int num_cells = cells.size();
+        int i = 0;
+        for (; i < num_cells; ++i) {
+          auto& cell = cells[i];
+          const int row_block_id = cell.block_id;
+          if (row_block_id >= num_row_blocks_e) break;
+          const int row_block_size =
+              transpose_block_structure->cols[row_block_id].size;
+          // clang-format off
+          MatrixTransposeMatrixMultiply
+              <kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>(
+                  values + cell.position, row_block_size, col_block_size,
+                  values + cell.position, row_block_size, col_block_size,
+                  cell_values, 0, 0, col_block_size, col_block_size);
+          // clang-format on
+        }
+        for (; i < num_cells; ++i) {
+          auto& cell = cells[i];
+          const int row_block_id = cell.block_id;
+          const int row_block_size =
+              transpose_block_structure->cols[row_block_id].size;
+          // clang-format off
+          MatrixTransposeMatrixMultiply
+              <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>(
+                  values + cell.position, row_block_size, col_block_size,
+                  values + cell.position, row_block_size, col_block_size,
+                  cell_values, 0, 0, col_block_size, col_block_size);
+          // clang-format on
+        }
+      },
+      f_cols_partition_);
+}
+
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+    UpdateBlockDiagonalFtF(BlockSparseMatrix* block_diagonal) const {
+  if (options_.num_threads == 1) {
+    UpdateBlockDiagonalFtFSingleThreaded(block_diagonal);
+  } else {
+    CHECK(options_.context != nullptr);
+    UpdateBlockDiagonalFtFMultiThreaded(block_diagonal);
+  }
+}
+
 }  // namespace ceres::internal
diff --git a/internal/ceres/partitioned_matrix_view_test.cc b/internal/ceres/partitioned_matrix_view_test.cc
index 3d10b1c..f74351e 100644
--- a/internal/ceres/partitioned_matrix_view_test.cc
+++ b/internal/ceres/partitioned_matrix_view_test.cc
@@ -49,63 +49,6 @@
 
 const double kEpsilon = 1e-14;
 
-class PartitionedMatrixViewTest : public ::testing::Test {
- protected:
-  void SetUp() final {
-    std::unique_ptr<LinearLeastSquaresProblem> problem =
-        CreateLinearLeastSquaresProblemFromId(2);
-    CHECK(problem != nullptr);
-    A_ = std::move(problem->A);
-
-    num_cols_ = A_->num_cols();
-    num_rows_ = A_->num_rows();
-    num_eliminate_blocks_ = problem->num_eliminate_blocks;
-    LinearSolver::Options options;
-    options.elimination_groups.push_back(num_eliminate_blocks_);
-    pmv_ = PartitionedMatrixViewBase::Create(
-        options, *down_cast<BlockSparseMatrix*>(A_.get()));
-  }
-
-  double RandDouble() { return distribution_(prng_); }
-
-  int num_rows_;
-  int num_cols_;
-  int num_eliminate_blocks_;
-  std::unique_ptr<SparseMatrix> A_;
-  std::unique_ptr<PartitionedMatrixViewBase> pmv_;
-  std::mt19937 prng_;
-  std::uniform_real_distribution<double> distribution_ =
-      std::uniform_real_distribution<double>(0.0, 1.0);
-};
-
-TEST_F(PartitionedMatrixViewTest, BlockDiagonalEtE) {
-  std::unique_ptr<BlockSparseMatrix> block_diagonal_ee(
-      pmv_->CreateBlockDiagonalEtE());
-  const CompressedRowBlockStructure* bs = block_diagonal_ee->block_structure();
-
-  EXPECT_EQ(block_diagonal_ee->num_rows(), 2);
-  EXPECT_EQ(block_diagonal_ee->num_cols(), 2);
-  EXPECT_EQ(bs->cols.size(), 2);
-  EXPECT_EQ(bs->rows.size(), 2);
-
-  EXPECT_NEAR(block_diagonal_ee->values()[0], 10.0, kEpsilon);
-  EXPECT_NEAR(block_diagonal_ee->values()[1], 155.0, kEpsilon);
-}
-
-TEST_F(PartitionedMatrixViewTest, BlockDiagonalFtF) {
-  std::unique_ptr<BlockSparseMatrix> block_diagonal_ff(
-      pmv_->CreateBlockDiagonalFtF());
-  const CompressedRowBlockStructure* bs = block_diagonal_ff->block_structure();
-
-  EXPECT_EQ(block_diagonal_ff->num_rows(), 3);
-  EXPECT_EQ(block_diagonal_ff->num_cols(), 3);
-  EXPECT_EQ(bs->cols.size(), 3);
-  EXPECT_EQ(bs->rows.size(), 3);
-  EXPECT_NEAR(block_diagonal_ff->values()[0], 70.0, kEpsilon);
-  EXPECT_NEAR(block_diagonal_ff->values()[1], 17.0, kEpsilon);
-  EXPECT_NEAR(block_diagonal_ff->values()[2], 37.0, kEpsilon);
-}
-
 // Param = <problem_id, num_threads>
 using Param = ::testing::tuple<int, int>;
 
@@ -116,7 +59,7 @@
   return ss.str();
 }
 
-class PartitionedMatrixViewSpMVTest : public ::testing::TestWithParam<Param> {
+class PartitionedMatrixViewTest : public ::testing::TestWithParam<Param> {
  protected:
   void SetUp() final {
     const int problem_id = ::testing::get<0>(GetParam());
@@ -132,6 +75,10 @@
     options_.elimination_groups.push_back(problem->num_eliminate_blocks);
     pmv_ = PartitionedMatrixViewBase::Create(options_, *block_sparse);
 
+    LinearSolver::Options options_single_threaded = options_;
+    options_single_threaded.num_threads = 1;
+    pmv_single_threaded_ = PartitionedMatrixViewBase::Create(options_, *block_sparse);
+
     EXPECT_EQ(pmv_->num_col_blocks_e(), problem->num_eliminate_blocks);
     EXPECT_EQ(pmv_->num_col_blocks_f(),
               block_sparse->block_structure()->cols.size() -
@@ -147,12 +94,13 @@
   std::unique_ptr<LinearLeastSquaresProblem> problem_;
   std::unique_ptr<SparseMatrix> A_;
   std::unique_ptr<PartitionedMatrixViewBase> pmv_;
+  std::unique_ptr<PartitionedMatrixViewBase> pmv_single_threaded_;
   std::mt19937 prng_;
   std::uniform_real_distribution<double> distribution_ =
       std::uniform_real_distribution<double>(0.0, 1.0);
 };
 
-TEST_P(PartitionedMatrixViewSpMVTest, RightMultiplyAndAccumulateE) {
+TEST_P(PartitionedMatrixViewTest, RightMultiplyAndAccumulateE) {
   Vector x1(pmv_->num_cols_e());
   Vector x2(pmv_->num_cols());
   x2.setZero();
@@ -172,7 +120,7 @@
   }
 }
 
-TEST_P(PartitionedMatrixViewSpMVTest, RightMultiplyAndAccumulateF) {
+TEST_P(PartitionedMatrixViewTest, RightMultiplyAndAccumulateF) {
   Vector x1(pmv_->num_cols_f());
   Vector x2(pmv_->num_cols());
   x2.setZero();
@@ -192,7 +140,7 @@
   }
 }
 
-TEST_P(PartitionedMatrixViewSpMVTest, LeftMultiplyAndAccumulate) {
+TEST_P(PartitionedMatrixViewTest, LeftMultiplyAndAccumulate) {
   Vector x = Vector::Zero(pmv_->num_rows());
   for (int i = 0; i < pmv_->num_rows(); ++i) {
     x(i) = RandDouble();
@@ -215,9 +163,109 @@
   }
 }
 
+TEST_P(PartitionedMatrixViewTest, BlockDiagonalFtF) {
+  std::unique_ptr<BlockSparseMatrix> block_diagonal_ff(
+      pmv_->CreateBlockDiagonalFtF());
+  const auto bs_diagonal = block_diagonal_ff->block_structure();
+  const int num_rows = pmv_->num_rows();
+  const int num_cols_f = pmv_->num_cols_f();
+  const int num_cols_e = pmv_->num_cols_e();
+  const int num_col_blocks_f = pmv_->num_col_blocks_f();
+  const int num_col_blocks_e = pmv_->num_col_blocks_e();
+
+  CHECK_EQ(block_diagonal_ff->num_rows(), num_cols_f);
+  CHECK_EQ(block_diagonal_ff->num_cols(), num_cols_f);
+
+  EXPECT_EQ(bs_diagonal->cols.size(), num_col_blocks_f);
+  EXPECT_EQ(bs_diagonal->rows.size(), num_col_blocks_f);
+
+  Matrix EF;
+  A_->ToDenseMatrix(&EF);
+  const auto F = EF.topRightCorner(num_rows, num_cols_f);
+
+  Matrix expected_FtF = F.transpose() * F;
+  Matrix actual_FtF;
+  block_diagonal_ff->ToDenseMatrix(&actual_FtF);
+
+  // FtF might be not block-diagonal
+  auto bs = down_cast<BlockSparseMatrix*>(A_.get())->block_structure();
+  for (int i = 0; i < num_col_blocks_f; ++i) {
+    const auto col_block_f = bs->cols[num_col_blocks_e + i];
+    const int block_size = col_block_f.size;
+    const int block_pos = col_block_f.position - num_cols_e;
+    const auto cell_expected =
+        expected_FtF.block(block_pos, block_pos, block_size, block_size);
+    auto cell_actual =
+        actual_FtF.block(block_pos, block_pos, block_size, block_size);
+    cell_actual -= cell_expected;
+    EXPECT_NEAR(cell_actual.norm(), 0., kEpsilon);
+  }
+  // There should be nothing remaining outside block-diagonal
+  EXPECT_NEAR(actual_FtF.norm(), 0., kEpsilon);
+}
+
+TEST_P(PartitionedMatrixViewTest, BlockDiagonalEtE) {
+  std::unique_ptr<BlockSparseMatrix> block_diagonal_ee(
+      pmv_->CreateBlockDiagonalEtE());
+  const CompressedRowBlockStructure* bs = block_diagonal_ee->block_structure();
+  const int num_rows = pmv_->num_rows();
+  const int num_cols_e = pmv_->num_cols_e();
+  const int num_col_blocks_e = pmv_->num_col_blocks_e();
+
+  CHECK_EQ(block_diagonal_ee->num_rows(), num_cols_e);
+  CHECK_EQ(block_diagonal_ee->num_cols(), num_cols_e);
+
+  EXPECT_EQ(bs->cols.size(), num_col_blocks_e);
+  EXPECT_EQ(bs->rows.size(), num_col_blocks_e);
+
+  Matrix EF;
+  A_->ToDenseMatrix(&EF);
+  const auto E = EF.topLeftCorner(num_rows, num_cols_e);
+
+  Matrix expected_EtE = E.transpose() * E;
+  Matrix actual_EtE;
+  block_diagonal_ee->ToDenseMatrix(&actual_EtE);
+
+  EXPECT_NEAR((expected_EtE - actual_EtE).norm(), 0., kEpsilon);
+}
+
+TEST_P(PartitionedMatrixViewTest, UpdateBlockDiagonalEtE) {
+  std::unique_ptr<BlockSparseMatrix> block_diagonal_ete(
+      pmv_->CreateBlockDiagonalEtE());
+  const CompressedRowBlockStructure* bs = block_diagonal_ete->block_structure();
+  const int num_cols = pmv_->num_cols_e();
+
+  Matrix multi_threaded(num_cols, num_cols);
+  pmv_->UpdateBlockDiagonalEtE(block_diagonal_ete.get());
+  block_diagonal_ete->ToDenseMatrix(&multi_threaded);
+
+  Matrix single_threaded(num_cols, num_cols);
+  pmv_single_threaded_->UpdateBlockDiagonalEtE(block_diagonal_ete.get());
+  block_diagonal_ete->ToDenseMatrix(&single_threaded);
+
+  EXPECT_NEAR((multi_threaded - single_threaded).norm(), 0., kEpsilon);
+}
+
+TEST_P(PartitionedMatrixViewTest, UpdateBlockDiagonalFtF) {
+  std::unique_ptr<BlockSparseMatrix> block_diagonal_ftf(
+      pmv_->CreateBlockDiagonalFtF());
+  const CompressedRowBlockStructure* bs = block_diagonal_ftf->block_structure();
+  const int num_cols = pmv_->num_cols_f();
+
+  Matrix multi_threaded(num_cols, num_cols);
+  pmv_->UpdateBlockDiagonalFtF(block_diagonal_ftf.get());
+  block_diagonal_ftf->ToDenseMatrix(&multi_threaded);
+
+  Matrix single_threaded(num_cols, num_cols);
+  pmv_single_threaded_->UpdateBlockDiagonalFtF(block_diagonal_ftf.get());
+  block_diagonal_ftf->ToDenseMatrix(&single_threaded);
+
+  EXPECT_NEAR((multi_threaded - single_threaded).norm(), 0., kEpsilon);
+}
+
 INSTANTIATE_TEST_SUITE_P(
     ParallelProducts,
-    PartitionedMatrixViewSpMVTest,
+    PartitionedMatrixViewTest,
     ::testing::Combine(::testing::Values(2, 4, 6),
                        ::testing::Values(1, 2, 3, 4, 5, 6, 7, 8)),
     ParamInfoToString);