BlockRandomAccessMatrix Refactor

1. Add threading to all three subclasses of BlockRandomAccessMatrix,
   i.e., BlockRandomAccessDenseMatrix, BlockRandomAccessSparseMatrix
   and BlockRandomAccessDiagonalMatrix.

   For BlockRandomAccessDenseMatrix and BlockRandomAccessSparseMatrix
   this just means SetZero is parallelized, which by itself is no
   big deal, but in doing so the constructors for all three subclasses
   become uniform.

   BlockRandomAccessSparseMatrix::SymmetricRightMultiplyAndAccumulate
   may be threaded in the future if needed.

   BlockRandomAccessDiagonalMatrix is the biggest beneficiary: SetZero,
   Invert and RightMultiplyAndAccumulate are all threaded now.
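
   The now-uniform constructor signatures, taken from the diff below:

     BlockRandomAccessDenseMatrix(std::vector<Block> blocks,
                                  ContextImpl* context,
                                  int num_threads);
     BlockRandomAccessSparseMatrix(
         const std::vector<Block>& blocks,
         const std::set<std::pair<int, int>>& block_pairs,
         ContextImpl* context,
         int num_threads);
     BlockRandomAccessDiagonalMatrix(const std::vector<Block>& blocks,
                                     ContextImpl* context,
                                     int num_threads);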

2. Change the storage in BlockRandomAccessDiagonalMatrix from
   TripletSparseMatrix to CompressedRowSparseMatrix. This has no
   performance implications, since we do not really use the indexing
   capabilities of the underlying matrix representation. It is a
   forward-looking change: when we decide to transfer this matrix to
   the GPU, a CompressedRowSparseMatrix will save us a data conversion.
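
   Each diagonal block is stored as its own contiguous chunk of the
   values array, so building the cell layout reduces to pointer
   arithmetic (this mirrors the new constructor in the diff):

     double* values = m_->mutable_values();
     layout_.reserve(blocks.size());
     for (auto& block : blocks) {
       layout_.emplace_back(std::make_unique<CellInfo>(values));
       values += block.size * block.size;
     }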

3. Use std::unique_ptr where appropriate, eliminating the need for
   custom destructors.
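
   For example, the layout in BlockRandomAccessSparseMatrix becomes

     using LayoutType = std::unordered_map<long, std::unique_ptr<CellInfo>>;

   so the compiler-generated destructor now suffices.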

4. Modify CompressedRowSparseMatrix::CreateBlockDiagonalMatrix to
   accept a nullptr as the diagonal.
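
   When the diagonal is a nullptr, the values are left unset and only
   the sparsity structure is built; BlockRandomAccessDiagonalMatrix
   now relies on this:

     m_ = CompressedRowSparseMatrix::CreateBlockDiagonalMatrix(nullptr, blocks);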

Fixes https://github.com/ceres-solver/ceres-solver/issues/936
Fixes https://github.com/ceres-solver/ceres-solver/issues/935

Change-Id: Ia6487f2d924fbe669835bdcc38abf2b451bda4ee
diff --git a/internal/ceres/block_jacobi_preconditioner.cc b/internal/ceres/block_jacobi_preconditioner.cc
index 8136f73..0d4ad13 100644
--- a/internal/ceres/block_jacobi_preconditioner.cc
+++ b/internal/ceres/block_jacobi_preconditioner.cc
@@ -50,7 +50,7 @@
     Preconditioner::Options options, const BlockSparseMatrix& A)
     : options_(std::move(options)) {
   m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(
-      A.block_structure()->cols);
+      A.block_structure()->cols, options.context, options.num_threads);
 }
 
 BlockSparseJacobiPreconditioner::~BlockSparseJacobiPreconditioner() = default;
@@ -109,8 +109,6 @@
                 });
   }
 
-  // TODO(sameeragarwal): Once matrices are threaded, this call to invert should
-  // also be parallelized.
   m_->Invert();
   return true;
 }
diff --git a/internal/ceres/block_random_access_dense_matrix.cc b/internal/ceres/block_random_access_dense_matrix.cc
index 50eb972..deb0a80 100644
--- a/internal/ceres/block_random_access_dense_matrix.cc
+++ b/internal/ceres/block_random_access_dense_matrix.cc
@@ -34,13 +34,14 @@
 #include <vector>
 
 #include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
 #include "glog/logging.h"
 
 namespace ceres::internal {
 
 BlockRandomAccessDenseMatrix::BlockRandomAccessDenseMatrix(
-    std::vector<Block> blocks)
-    : blocks_(std::move(blocks)) {
+    std::vector<Block> blocks, ContextImpl* context, int num_threads)
+    : blocks_(std::move(blocks)), context_(context), num_threads_(num_threads) {
   const int num_blocks = blocks_.size();
   num_rows_ = NumScalarEntries(blocks_);
   values_ = std::make_unique<double[]>(num_rows_ * num_rows_);
@@ -52,10 +53,6 @@
   SetZero();
 }
 
-// Assume that the user does not hold any locks on any cell blocks
-// when they are calling SetZero.
-BlockRandomAccessDenseMatrix::~BlockRandomAccessDenseMatrix() = default;
-
 CellInfo* BlockRandomAccessDenseMatrix::GetCell(const int row_block_id,
                                                 const int col_block_id,
                                                 int* row,
@@ -72,9 +69,7 @@
 // Assume that the user does not hold any locks on any cell blocks
 // when they are calling SetZero.
 void BlockRandomAccessDenseMatrix::SetZero() {
-  if (num_rows_) {
-    VectorRef(values_.get(), num_rows_ * num_rows_).setZero();
-  }
+  ParallelSetZero(context_, num_threads_, values_.get(), num_rows_ * num_rows_);
 }
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/block_random_access_dense_matrix.h b/internal/ceres/block_random_access_dense_matrix.h
index 085a54d..d958ed1 100644
--- a/internal/ceres/block_random_access_dense_matrix.h
+++ b/internal/ceres/block_random_access_dense_matrix.h
@@ -36,6 +36,7 @@
 
 #include "ceres/block_random_access_matrix.h"
 #include "ceres/block_structure.h"
+#include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
 #include "ceres/internal/export.h"
 
@@ -56,13 +57,11 @@
  public:
   // blocks is a vector of block sizes. The resulting matrix has
   // blocks.size() * blocks.size() cells.
-  explicit BlockRandomAccessDenseMatrix(std::vector<Block> blocks);
-  BlockRandomAccessDenseMatrix(const BlockRandomAccessDenseMatrix&) = delete;
-  void operator=(const BlockRandomAccessDenseMatrix&) = delete;
+  explicit BlockRandomAccessDenseMatrix(std::vector<Block> blocks,
+                                        ContextImpl* context,
+                                        int num_threads);
 
-  // The destructor is not thread safe. It assumes that no one is
-  // modifying any cells when the matrix is being destroyed.
-  ~BlockRandomAccessDenseMatrix() override;
+  ~BlockRandomAccessDenseMatrix() override = default;
 
   // BlockRandomAccessMatrix interface.
   CellInfo* GetCell(int row_block_id,
@@ -72,8 +71,6 @@
                     int* row_stride,
                     int* col_stride) final;
 
-  // This is not a thread safe method, it assumes that no cell is
-  // locked.
   void SetZero() final;
 
   // Since the matrix is square with the same row and column block
@@ -86,8 +83,10 @@
   double* mutable_values() { return values_.get(); }
 
  private:
-  int num_rows_;
   std::vector<Block> blocks_;
+  ContextImpl* context_ = nullptr;
+  int num_threads_ = -1;
+  int num_rows_ = -1;
   std::unique_ptr<double[]> values_;
   std::unique_ptr<CellInfo[]> cell_infos_;
 };
diff --git a/internal/ceres/block_random_access_dense_matrix_test.cc b/internal/ceres/block_random_access_dense_matrix_test.cc
index b0cd5b9..2a3626f 100644
--- a/internal/ceres/block_random_access_dense_matrix_test.cc
+++ b/internal/ceres/block_random_access_dense_matrix_test.cc
@@ -38,12 +38,15 @@
 namespace ceres::internal {
 
 TEST(BlockRandomAccessDenseMatrix, GetCell) {
+  ContextImpl context;
+  constexpr int num_threads = 1;
+
   std::vector<Block> blocks;
   blocks.emplace_back(3, 0);
   blocks.emplace_back(4, 3);
   blocks.emplace_back(5, 7);
-  const int num_rows = 3 + 4 + 5;
-  BlockRandomAccessDenseMatrix m(blocks);
+  constexpr int num_rows = 3 + 4 + 5;
+  BlockRandomAccessDenseMatrix m(blocks, &context, num_threads);
   EXPECT_EQ(m.num_rows(), num_rows);
   EXPECT_EQ(m.num_cols(), num_rows);
 
@@ -67,13 +70,16 @@
 }
 
 TEST(BlockRandomAccessDenseMatrix, WriteCell) {
+  ContextImpl context;
+  constexpr int num_threads = 1;
+
   std::vector<Block> blocks;
   blocks.emplace_back(3, 0);
   blocks.emplace_back(4, 3);
   blocks.emplace_back(5, 7);
-  const int num_rows = 3 + 4 + 5;
+  constexpr int num_rows = 3 + 4 + 5;
 
-  BlockRandomAccessDenseMatrix m(blocks);
+  BlockRandomAccessDenseMatrix m(blocks, &context, num_threads);
 
   // Fill the cell (i,j) with (i + 1) * (j + 1)
   for (int i = 0; i < blocks.size(); ++i) {
diff --git a/internal/ceres/block_random_access_diagonal_matrix.cc b/internal/ceres/block_random_access_diagonal_matrix.cc
index 2e29604..fd109ba 100644
--- a/internal/ceres/block_random_access_diagonal_matrix.cc
+++ b/internal/ceres/block_random_access_diagonal_matrix.cc
@@ -37,49 +37,27 @@
 #include <vector>
 
 #include "Eigen/Dense"
+#include "ceres/compressed_row_sparse_matrix.h"
 #include "ceres/internal/export.h"
+#include "ceres/parallel_for.h"
 #include "ceres/stl_util.h"
-#include "ceres/triplet_sparse_matrix.h"
 #include "ceres/types.h"
 #include "glog/logging.h"
 
 namespace ceres::internal {
 
-// TODO(sameeragarwal): Drop the dependence on TripletSparseMatrix.
-
 BlockRandomAccessDiagonalMatrix::BlockRandomAccessDiagonalMatrix(
-    std::vector<Block> blocks)
-    : blocks_(std::move(blocks)) {
-  const int num_cols = NumScalarEntries(blocks_);
-  const int num_nonzeros = SumSquaredSizes(blocks_);
-  VLOG(1) << "Matrix Size [" << num_cols << "," << num_cols << "] "
-          << num_nonzeros;
-
-  tsm_ =
-      std::make_unique<TripletSparseMatrix>(num_cols, num_cols, num_nonzeros);
-  tsm_->set_num_nonzeros(num_nonzeros);
-  int* rows = tsm_->mutable_rows();
-  int* cols = tsm_->mutable_cols();
-  double* values = tsm_->mutable_values();
-
-  int pos = 0;
-  for (auto& block : blocks_) {
-    layout_.push_back(new CellInfo(values + pos));
-    for (int r = 0; r < block.size; ++r) {
-      for (int c = 0; c < block.size; ++c, ++pos) {
-        rows[pos] = block.position + r;
-        cols[pos] = block.position + c;
-      }
-    }
+    const std::vector<Block>& blocks, ContextImpl* context, int num_threads)
+    : context_(context), num_threads_(num_threads) {
+  m_ = CompressedRowSparseMatrix::CreateBlockDiagonalMatrix(nullptr, blocks);
+  double* values = m_->mutable_values();
+  layout_.reserve(blocks.size());
+  for (auto& block : blocks) {
+    layout_.emplace_back(std::make_unique<CellInfo>(values));
+    values += block.size * block.size;
   }
 }
 
-// Assume that the user does not hold any locks on any cell blocks
-// when they are calling SetZero.
-BlockRandomAccessDiagonalMatrix::~BlockRandomAccessDiagonalMatrix() {
-  STLDeleteContainerPointers(layout_.begin(), layout_.end());
-}
-
 CellInfo* BlockRandomAccessDiagonalMatrix::GetCell(int row_block_id,
                                                    int col_block_id,
                                                    int* row,
@@ -89,46 +67,51 @@
   if (row_block_id != col_block_id) {
     return nullptr;
   }
-  const int stride = blocks_[row_block_id].size;
+
+  auto& blocks = m_->row_blocks();
+  const int stride = blocks[row_block_id].size;
 
   // Each cell is stored contiguously as its own little dense matrix.
   *row = 0;
   *col = 0;
   *row_stride = stride;
   *col_stride = stride;
-  return layout_[row_block_id];
+  return layout_[row_block_id].get();
 }
 
 // Assume that the user does not hold any locks on any cell blocks
 // when they are calling SetZero.
 void BlockRandomAccessDiagonalMatrix::SetZero() {
-  if (tsm_->num_nonzeros()) {
-    VectorRef(tsm_->mutable_values(), tsm_->num_nonzeros()).setZero();
-  }
+  ParallelSetZero(
+      context_, num_threads_, m_->mutable_values(), m_->num_nonzeros());
 }
 
 void BlockRandomAccessDiagonalMatrix::Invert() {
-  double* values = tsm_->mutable_values();
-  for (auto& block : blocks_) {
-    MatrixRef b(values, block.size, block.size);
+  auto& blocks = m_->row_blocks();
+  const int num_blocks = blocks.size();
+  ParallelFor(context_, 0, num_blocks, num_threads_, [this, blocks](int i) {
+    auto* cell_info = layout_[i].get();
+    auto& block = blocks[i];
+    MatrixRef b(cell_info->values, block.size, block.size);
     b = b.selfadjointView<Eigen::Upper>().llt().solve(
         Matrix::Identity(block.size, block.size));
-    values += block.size * block.size;
-  }
+  });
 }
 
 void BlockRandomAccessDiagonalMatrix::RightMultiplyAndAccumulate(
     const double* x, double* y) const {
   CHECK(x != nullptr);
   CHECK(y != nullptr);
-  const double* values = tsm_->values();
-  for (auto& block : blocks_) {
-    ConstMatrixRef b(values, block.size, block.size);
-    VectorRef(y, block.size).noalias() += b * ConstVectorRef(x, block.size);
-    x += block.size;
-    y += block.size;
-    values += block.size * block.size;
-  }
+  auto& blocks = m_->row_blocks();
+  const int num_blocks = blocks.size();
+  ParallelFor(
+      context_, 0, num_blocks, num_threads_, [this, blocks, x, y](int i) {
+        auto* cell_info = layout_[i].get();
+        auto& block = blocks[i];
+        ConstMatrixRef b(cell_info->values, block.size, block.size);
+        VectorRef(y + block.position, block.size).noalias() +=
+            b * ConstVectorRef(x + block.position, block.size);
+      });
 }
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/block_random_access_diagonal_matrix.h b/internal/ceres/block_random_access_diagonal_matrix.h
index d058396..374dd82 100644
--- a/internal/ceres/block_random_access_diagonal_matrix.h
+++ b/internal/ceres/block_random_access_diagonal_matrix.h
@@ -32,33 +32,30 @@
 #define CERES_INTERNAL_BLOCK_RANDOM_ACCESS_DIAGONAL_MATRIX_H_
 
 #include <memory>
-#include <set>
 #include <utility>
 #include <vector>
 
 #include "ceres/block_random_access_matrix.h"
 #include "ceres/block_structure.h"
+#include "ceres/compressed_row_sparse_matrix.h"
+#include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
 #include "ceres/internal/export.h"
-#include "ceres/triplet_sparse_matrix.h"
 #include "ceres/types.h"
 
 namespace ceres::internal {
 
-// A thread safe block diagonal matrix implementation of
-// BlockRandomAccessMatrix.
+// A BlockRandomAccessMatrix which only stores the block diagonal.
+// BlockRandomAccessSparseMatrix can also be used to do this, but this class is
+// more efficient in time and in space.
 class CERES_NO_EXPORT BlockRandomAccessDiagonalMatrix
     : public BlockRandomAccessMatrix {
  public:
   // blocks is an array of block sizes.
-  explicit BlockRandomAccessDiagonalMatrix(std::vector<Block> blocks);
-  BlockRandomAccessDiagonalMatrix(const BlockRandomAccessDiagonalMatrix&) =
-      delete;
-  void operator=(const BlockRandomAccessDiagonalMatrix&) = delete;
-
-  // The destructor is not thread safe. It assumes that no one is
-  // modifying any cells when the matrix is being destroyed.
-  ~BlockRandomAccessDiagonalMatrix() override;
+  BlockRandomAccessDiagonalMatrix(const std::vector<Block>& blocks,
+                                  ContextImpl* context,
+                                  int num_threads);
+  ~BlockRandomAccessDiagonalMatrix() override = default;
 
   // BlockRandomAccessMatrix Interface.
   CellInfo* GetCell(int row_block_id,
@@ -68,32 +65,27 @@
                     int* row_stride,
                     int* col_stride) final;
 
-  // This is not a thread safe method, it assumes that no cell is
-  // locked.
+  // m = 0
   void SetZero() final;
 
-  // Invert the matrix assuming that each block is positive definite.
+  // m = m^{-1}
   void Invert();
 
-  // y += S * x
+  // y += m * x
   void RightMultiplyAndAccumulate(const double* x, double* y) const;
 
   // Since the matrix is square, num_rows() == num_cols().
-  int num_rows() const final { return tsm_->num_rows(); }
-  int num_cols() const final { return tsm_->num_cols(); }
+  int num_rows() const final { return m_->num_rows(); }
+  int num_cols() const final { return m_->num_cols(); }
 
-  const TripletSparseMatrix* matrix() const { return tsm_.get(); }
-  TripletSparseMatrix* mutable_matrix() { return tsm_.get(); }
+  const CompressedRowSparseMatrix* matrix() const { return m_.get(); }
+  CompressedRowSparseMatrix* mutable_matrix() { return m_.get(); }
 
  private:
-  // row/column block sizes.
-  const std::vector<Block> blocks_;
-  std::vector<CellInfo*> layout_;
-
-  // The underlying matrix object which actually stores the cells.
-  std::unique_ptr<TripletSparseMatrix> tsm_;
-
-  friend class BlockRandomAccessDiagonalMatrixTest;
+  ContextImpl* context_ = nullptr;
+  const int num_threads_ = 1;
+  std::unique_ptr<CompressedRowSparseMatrix> m_;
+  std::vector<std::unique_ptr<CellInfo>> layout_;
 };
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/block_random_access_diagonal_matrix_test.cc b/internal/ceres/block_random_access_diagonal_matrix_test.cc
index 5bf47ff..6ccc2f0 100644
--- a/internal/ceres/block_random_access_diagonal_matrix_test.cc
+++ b/internal/ceres/block_random_access_diagonal_matrix_test.cc
@@ -52,7 +52,8 @@
     const int num_rows = 3 + 4 + 5;
     num_nonzeros_ = 3 * 3 + 4 * 4 + 5 * 5;
 
-    m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(blocks);
+    m_ =
+        std::make_unique<BlockRandomAccessDiagonalMatrix>(blocks, &context_, 1);
 
     EXPECT_EQ(m_->num_rows(), num_rows);
     EXPECT_EQ(m_->num_cols(), num_rows);
@@ -97,17 +98,17 @@
   }
 
  protected:
+  ContextImpl context_;
   int num_nonzeros_;
   std::unique_ptr<BlockRandomAccessDiagonalMatrix> m_;
 };
 
 TEST_F(BlockRandomAccessDiagonalMatrixTest, MatrixContents) {
-  const TripletSparseMatrix* tsm = m_->matrix();
-  EXPECT_EQ(tsm->num_nonzeros(), num_nonzeros_);
-  EXPECT_EQ(tsm->max_num_nonzeros(), num_nonzeros_);
+  auto* crsm = m_->matrix();
+  EXPECT_EQ(crsm->num_nonzeros(), num_nonzeros_);
 
   Matrix dense;
-  tsm->ToDenseMatrix(&dense);
+  crsm->ToDenseMatrix(&dense);
 
   double kTolerance = 1e-14;
 
@@ -141,9 +142,9 @@
 
 TEST_F(BlockRandomAccessDiagonalMatrixTest, RightMultiplyAndAccumulate) {
   double kTolerance = 1e-14;
-  const TripletSparseMatrix* tsm = m_->matrix();
+  auto* crsm = m_->matrix();
   Matrix dense;
-  tsm->ToDenseMatrix(&dense);
+  crsm->ToDenseMatrix(&dense);
   Vector x = Vector::Random(dense.rows());
   Vector expected_y = dense * x;
   Vector actual_y = Vector::Zero(dense.rows());
@@ -153,14 +154,14 @@
 
 TEST_F(BlockRandomAccessDiagonalMatrixTest, Invert) {
   double kTolerance = 1e-14;
-  const TripletSparseMatrix* tsm = m_->matrix();
+  auto* crsm = m_->matrix();
   Matrix dense;
-  tsm->ToDenseMatrix(&dense);
+  crsm->ToDenseMatrix(&dense);
   Matrix expected_inverse =
       dense.llt().solve(Matrix::Identity(dense.rows(), dense.rows()));
 
   m_->Invert();
-  tsm->ToDenseMatrix(&dense);
+  crsm->ToDenseMatrix(&dense);
 
   EXPECT_NEAR((expected_inverse - dense).norm(), 0.0, kTolerance);
 }
diff --git a/internal/ceres/block_random_access_sparse_matrix.cc b/internal/ceres/block_random_access_sparse_matrix.cc
index f27fc08..784ffeb 100644
--- a/internal/ceres/block_random_access_sparse_matrix.cc
+++ b/internal/ceres/block_random_access_sparse_matrix.cc
@@ -37,6 +37,7 @@
 #include <vector>
 
 #include "ceres/internal/export.h"
+#include "ceres/parallel_for.h"
 #include "ceres/triplet_sparse_matrix.h"
 #include "ceres/types.h"
 #include "glog/logging.h"
@@ -45,8 +46,13 @@
 
 BlockRandomAccessSparseMatrix::BlockRandomAccessSparseMatrix(
     const std::vector<Block>& blocks,
-    const std::set<std::pair<int, int>>& block_pairs)
-    : kMaxRowBlocks(10 * 1000 * 1000), blocks_(blocks) {
+    const std::set<std::pair<int, int>>& block_pairs,
+    ContextImpl* context,
+    int num_threads)
+    : kMaxRowBlocks(10 * 1000 * 1000),
+      blocks_(blocks),
+      context_(context),
+      num_threads_(num_threads) {
   CHECK_LT(blocks.size(), kMaxRowBlocks);
 
   const int num_cols = NumScalarEntries(blocks);
@@ -77,7 +83,7 @@
     const int col_block_size = blocks_[block_pair.second].size;
     cell_values_.emplace_back(block_pair, values + pos);
     layout_[IntPairToLong(block_pair.first, block_pair.second)] =
-        new CellInfo(values + pos);
+        std::make_unique<CellInfo>(values + pos);
     pos += row_block_size * col_block_size;
   }
 
@@ -101,14 +107,6 @@
   }
 }
 
-// Assume that the user does not hold any locks on any cell blocks
-// when they are calling SetZero.
-BlockRandomAccessSparseMatrix::~BlockRandomAccessSparseMatrix() {
-  for (const auto& entry : layout_) {
-    delete entry.second;
-  }
-}
-
 CellInfo* BlockRandomAccessSparseMatrix::GetCell(int row_block_id,
                                                  int col_block_id,
                                                  int* row,
@@ -125,15 +123,14 @@
   *col = 0;
   *row_stride = blocks_[row_block_id].size;
   *col_stride = blocks_[col_block_id].size;
-  return it->second;
+  return it->second.get();
 }
 
 // Assume that the user does not hold any locks on any cell blocks
 // when they are calling SetZero.
 void BlockRandomAccessSparseMatrix::SetZero() {
-  if (tsm_->num_nonzeros()) {
-    VectorRef(tsm_->mutable_values(), tsm_->num_nonzeros()).setZero();
-  }
+  ParallelSetZero(
+      context_, num_threads_, tsm_->mutable_values(), tsm_->num_nonzeros());
 }
 
 void BlockRandomAccessSparseMatrix::SymmetricRightMultiplyAndAccumulate(
diff --git a/internal/ceres/block_random_access_sparse_matrix.h b/internal/ceres/block_random_access_sparse_matrix.h
index 7161e99..8aa13c6 100644
--- a/internal/ceres/block_random_access_sparse_matrix.h
+++ b/internal/ceres/block_random_access_sparse_matrix.h
@@ -40,6 +40,7 @@
 
 #include "ceres/block_random_access_matrix.h"
 #include "ceres/block_structure.h"
+#include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
 #include "ceres/internal/export.h"
 #include "ceres/small_blas.h"
@@ -60,13 +61,13 @@
   // of this matrix.
   BlockRandomAccessSparseMatrix(
       const std::vector<Block>& blocks,
-      const std::set<std::pair<int, int>>& block_pairs);
-  BlockRandomAccessSparseMatrix(const BlockRandomAccessSparseMatrix&) = delete;
-  void operator=(const BlockRandomAccessSparseMatrix&) = delete;
+      const std::set<std::pair<int, int>>& block_pairs,
+      ContextImpl* context,
+      int num_threads);
 
   // The destructor is not thread safe. It assumes that no one is
   // modifying any cells when the matrix is being destroyed.
-  ~BlockRandomAccessSparseMatrix() override;
+  ~BlockRandomAccessSparseMatrix() override = default;
 
   // BlockRandomAccessMatrix Interface.
   CellInfo* GetCell(int row_block_id,
@@ -108,14 +109,16 @@
 
   // row/column block sizes.
   const std::vector<Block> blocks_;
+  ContextImpl* context_ = nullptr;
+  const int num_threads_ = 1;
 
   // A mapping from <row_block_id, col_block_id> to the position in
   // the values array of tsm_ where the block is stored.
-  using LayoutType = std::unordered_map<long, CellInfo*>;
+  using LayoutType = std::unordered_map<long, std::unique_ptr<CellInfo>>;
   LayoutType layout_;
 
   // In order traversal of contents of the matrix. This allows us to
-  // implement a matrix-vector which is 20% faster than using the
+  // implement a matrix-vector product which is 20% faster than using the
   // iterator in the Layout object instead.
   std::vector<std::pair<std::pair<int, int>, double*>> cell_values_;
   // The underlying matrix object which actually stores the cells.
diff --git a/internal/ceres/block_random_access_sparse_matrix_test.cc b/internal/ceres/block_random_access_sparse_matrix_test.cc
index 78a6c3e..0e21a1e 100644
--- a/internal/ceres/block_random_access_sparse_matrix_test.cc
+++ b/internal/ceres/block_random_access_sparse_matrix_test.cc
@@ -43,11 +43,13 @@
 namespace ceres::internal {
 
 TEST(BlockRandomAccessSparseMatrix, GetCell) {
+  ContextImpl context;
+  constexpr int num_threads = 1;
   std::vector<Block> blocks;
   blocks.emplace_back(3, 0);
   blocks.emplace_back(4, 3);
   blocks.emplace_back(5, 7);
-  const int num_rows = 3 + 4 + 5;
+  constexpr int num_rows = 3 + 4 + 5;
 
   std::set<std::pair<int, int>> block_pairs;
   int num_nonzeros = 0;
@@ -63,7 +65,7 @@
   block_pairs.emplace(0, 2);
   num_nonzeros += blocks[2].size * blocks[0].size;
 
-  BlockRandomAccessSparseMatrix m(blocks, block_pairs);
+  BlockRandomAccessSparseMatrix m(blocks, block_pairs, &context, num_threads);
   EXPECT_EQ(m.num_rows(), num_rows);
   EXPECT_EQ(m.num_cols(), num_rows);
 
@@ -138,7 +140,8 @@
     blocks.emplace_back(1, 0);
     std::set<std::pair<int, int>> block_pairs;
     block_pairs.emplace(0, 0);
-    m_ = std::make_unique<BlockRandomAccessSparseMatrix>(blocks, block_pairs);
+    m_ = std::make_unique<BlockRandomAccessSparseMatrix>(
+        blocks, block_pairs, &context_, 1);
   }
 
   void CheckIntPairToLong(int a, int b) {
@@ -163,6 +166,7 @@
   }
 
  private:
+  ContextImpl context_;
   std::unique_ptr<BlockRandomAccessSparseMatrix> m_;
 };
 
diff --git a/internal/ceres/compressed_row_sparse_matrix.cc b/internal/ceres/compressed_row_sparse_matrix.cc
index 553dc21..890804f 100644
--- a/internal/ceres/compressed_row_sparse_matrix.cc
+++ b/internal/ceres/compressed_row_sparse_matrix.cc
@@ -578,7 +578,9 @@
   for (auto& block : blocks) {
     for (int r = 0; r < block.size; ++r) {
       *(rows++) = idx_cursor;
-      values[idx_cursor + r] = diagonal[col_cursor + r];
+      if (diagonal != nullptr) {
+        values[idx_cursor + r] = diagonal[col_cursor + r];
+      }
       for (int c = 0; c < block.size; ++c, ++idx_cursor) {
         *(cols++) = col_cursor + c;
       }
diff --git a/internal/ceres/implicit_schur_complement_test.cc b/internal/ceres/implicit_schur_complement_test.cc
index f0bbf6d..3fbfdf1 100644
--- a/internal/ceres/implicit_schur_complement_test.cc
+++ b/internal/ceres/implicit_schur_complement_test.cc
@@ -75,7 +75,7 @@
     const CompressedRowBlockStructure* bs = A_->block_structure();
     const int num_col_blocks = bs->cols.size();
     auto blocks = Tail(bs->cols, num_col_blocks - num_eliminate_blocks_);
-    BlockRandomAccessDenseMatrix blhs(blocks);
+    BlockRandomAccessDenseMatrix blhs(blocks, &context_, 1);
     const int num_schur_rows = blhs.num_rows();
 
     LinearSolver::Options options;
@@ -216,6 +216,7 @@
     return testing::AssertionSuccess();
   }
 
+  ContextImpl context_;
   int num_rows_;
   int num_cols_;
   int num_eliminate_blocks_;
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc
index ace075e..b4aa809 100644
--- a/internal/ceres/iterative_schur_complement_solver.cc
+++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -68,9 +68,8 @@
   EventLogger event_logger("IterativeSchurComplementSolver::Solve");
 
   CHECK(A->block_structure() != nullptr);
-  if (options_.num_threads != 1) {
-    A->AddTransposeBlockStructure();
-  }
+  CHECK(A->transpose_block_structure() != nullptr);
+
   const int num_eliminate_blocks = options_.elimination_groups[0];
   // Initialize a ImplicitSchurComplement object.
   if (schur_complement_ == nullptr) {
diff --git a/internal/ceres/schur_complement_solver.cc b/internal/ceres/schur_complement_solver.cc
index 2f7b87d..1972dc0 100644
--- a/internal/ceres/schur_complement_solver.cc
+++ b/internal/ceres/schur_complement_solver.cc
@@ -170,7 +170,8 @@
   const int num_eliminate_blocks = options().elimination_groups[0];
   const int num_col_blocks = bs->cols.size();
   auto blocks = Tail(bs->cols, num_col_blocks - num_eliminate_blocks);
-  set_lhs(std::make_unique<BlockRandomAccessDenseMatrix>(blocks));
+  set_lhs(std::make_unique<BlockRandomAccessDenseMatrix>(
+      blocks, options().context, options().num_threads));
   ResizeRhs(lhs()->num_rows());
 }
 
@@ -280,8 +281,8 @@
     }
   }
 
-  set_lhs(
-      std::make_unique<BlockRandomAccessSparseMatrix>(blocks_, block_pairs));
+  set_lhs(std::make_unique<BlockRandomAccessSparseMatrix>(
+      blocks_, block_pairs, options().context, options().num_threads));
   ResizeRhs(lhs()->num_rows());
 }
 
@@ -345,8 +346,8 @@
   CHECK_EQ(options().preconditioner_type, SCHUR_JACOBI);
 
   if (preconditioner_ == nullptr) {
-    preconditioner_ =
-        std::make_unique<BlockRandomAccessDiagonalMatrix>(blocks_);
+    preconditioner_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(
+        blocks_, options().context, options().num_threads);
   }
 
   auto* sc = down_cast<BlockRandomAccessSparseMatrix*>(mutable_lhs());
diff --git a/internal/ceres/schur_eliminator_benchmark.cc b/internal/ceres/schur_eliminator_benchmark.cc
index 407dfd9..8478499 100644
--- a/internal/ceres/schur_eliminator_benchmark.cc
+++ b/internal/ceres/schur_eliminator_benchmark.cc
@@ -102,7 +102,7 @@
 
     std::vector<Block> blocks;
     blocks.emplace_back(kFBlockSize, 0);
-    lhs_ = std::make_unique<BlockRandomAccessDenseMatrix>(blocks);
+    lhs_ = std::make_unique<BlockRandomAccessDenseMatrix>(blocks, &context_, 1);
     diagonal_.resize(matrix_->num_cols());
     diagonal_.setOnes();
     rhs_.resize(kFBlockSize);
@@ -121,7 +121,11 @@
   Vector* mutable_y() { return &y_; }
   Vector* mutable_z() { return &z_; }
 
+  ContextImpl* context() { return &context_; }
+
  private:
+  ContextImpl context_;
+
   std::unique_ptr<BlockSparseMatrix> matrix_;
   Vector b_;
   std::unique_ptr<BlockRandomAccessDenseMatrix> lhs_;
@@ -137,12 +141,11 @@
   const int num_e_blocks = state.range(0);
   BenchmarkData data(num_e_blocks);
 
-  ContextImpl context;
   LinearSolver::Options linear_solver_options;
   linear_solver_options.e_block_size = kEBlockSize;
   linear_solver_options.row_block_size = kRowBlockSize;
   linear_solver_options.f_block_size = kFBlockSize;
-  linear_solver_options.context = &context;
+  linear_solver_options.context = data.context();
   std::unique_ptr<SchurEliminatorBase> eliminator(
       SchurEliminatorBase::Create(linear_solver_options));
 
@@ -160,12 +163,11 @@
   const int num_e_blocks = state.range(0);
   BenchmarkData data(num_e_blocks);
 
-  ContextImpl context;
   LinearSolver::Options linear_solver_options;
   linear_solver_options.e_block_size = kEBlockSize;
   linear_solver_options.row_block_size = kRowBlockSize;
   linear_solver_options.f_block_size = kFBlockSize;
-  linear_solver_options.context = &context;
+  linear_solver_options.context = data.context();
   std::unique_ptr<SchurEliminatorBase> eliminator(
       SchurEliminatorBase::Create(linear_solver_options));
 
diff --git a/internal/ceres/schur_eliminator_test.cc b/internal/ceres/schur_eliminator_test.cc
index b703477..6ae856f 100644
--- a/internal/ceres/schur_eliminator_test.cc
+++ b/internal/ceres/schur_eliminator_test.cc
@@ -127,7 +127,7 @@
     const CompressedRowBlockStructure* bs = A->block_structure();
     const int num_col_blocks = bs->cols.size();
     auto blocks = Tail(bs->cols, num_col_blocks - num_eliminate_blocks);
-    BlockRandomAccessDenseMatrix lhs(blocks);
+    BlockRandomAccessDenseMatrix lhs(blocks, &context_, 1);
 
     const int num_cols = A->num_cols();
     const int schur_size = lhs.num_rows();
@@ -135,8 +135,7 @@
     Vector rhs(schur_size);
 
     LinearSolver::Options options;
-    ContextImpl context;
-    options.context = &context;
+    options.context = &context_;
     options.elimination_groups.push_back(num_eliminate_blocks);
     if (use_static_structure) {
       DetectStructure(*bs,
@@ -178,6 +177,8 @@
                 relative_tolerance);
   }
 
+  ContextImpl context_;
+
   std::unique_ptr<BlockSparseMatrix> A;
   std::unique_ptr<double[]> b;
   std::unique_ptr<double[]> D;
@@ -224,6 +225,8 @@
   constexpr int kFBlockSize = 6;
   constexpr int num_e_blocks = 5;
 
+  ContextImpl context;
+
   auto* bs = new CompressedRowBlockStructure;
   bs->cols.resize(num_e_blocks + 1);
   int col_pos = 0;
@@ -294,8 +297,8 @@
 
   std::vector<Block> blocks;
   blocks.emplace_back(kFBlockSize, 0);
-  BlockRandomAccessDenseMatrix actual_lhs(blocks);
-  BlockRandomAccessDenseMatrix expected_lhs(blocks);
+  BlockRandomAccessDenseMatrix actual_lhs(blocks, &context, 1);
+  BlockRandomAccessDenseMatrix expected_lhs(blocks, &context, 1);
   Vector actual_rhs(kFBlockSize);
   Vector expected_rhs(kFBlockSize);
 
@@ -307,7 +310,6 @@
   expected_e_sol.setZero();
 
   {
-    ContextImpl context;
     LinearSolver::Options linear_solver_options;
     linear_solver_options.e_block_size = kEBlockSize;
     linear_solver_options.row_block_size = kRowBlockSize;
diff --git a/internal/ceres/schur_jacobi_preconditioner.cc b/internal/ceres/schur_jacobi_preconditioner.cc
index 9870924..78c51c8 100644
--- a/internal/ceres/schur_jacobi_preconditioner.cc
+++ b/internal/ceres/schur_jacobi_preconditioner.cc
@@ -59,7 +59,8 @@
     position += blocks[i].size;
   }
 
-  m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(blocks);
+  m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(
+      blocks, options.context, options.num_threads);
   InitEliminator(bs);
 }
 
diff --git a/internal/ceres/visibility_based_preconditioner.cc b/internal/ceres/visibility_based_preconditioner.cc
index 656bdbe..d2e4ee0 100644
--- a/internal/ceres/visibility_based_preconditioner.cc
+++ b/internal/ceres/visibility_based_preconditioner.cc
@@ -157,7 +157,8 @@
 void VisibilityBasedPreconditioner::InitStorage(
     const CompressedRowBlockStructure& bs) {
   ComputeBlockPairsInPreconditioner(bs);
-  m_ = std::make_unique<BlockRandomAccessSparseMatrix>(blocks_, block_pairs_);
+  m_ = std::make_unique<BlockRandomAccessSparseMatrix>(
+      blocks_, block_pairs_, options_.context, options_.num_threads);
 }
 
 // Call the canonical views algorithm and cluster the cameras based on