Parallelize block_jacobi_preconditioner

Use ParallelFor to parallelize both versions of the block Jacobi
preconditioner. Also add benchmarks for a varying number of threads.

Benchmarks on an M1 Mac Pro:

Before:
-----------------------------------------------------------------------------------------
Benchmark                                               Time             CPU   Iterations
-----------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditionerBA             44847927 ns     44788313 ns           16
BM_BlockCRSJacobiPreconditionerBA                48772330 ns     48723571 ns           14
BM_BlockSparseJacobiPreconditionerUnstructured   62385231 ns     62306818 ns           11
BM_BlockCRSJacobiPreconditionerUnstructured      60671473 ns     60577727 ns           11

After:
--------------------------------------------------------------------------------------------
Benchmark                                                  Time             CPU   Iterations
--------------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditionerBA/1              53314862 ns     53302308 ns           13
BM_BlockSparseJacobiPreconditionerBA/2              33601214 ns     33295143 ns           21
BM_BlockSparseJacobiPreconditionerBA/4              28162794 ns     27224167 ns           30
BM_BlockSparseJacobiPreconditionerBA/8              31402448 ns     28038760 ns           25
BM_BlockSparseJacobiPreconditionerBA/16             30820813 ns     22625233 ns           30
BM_BlockCRSJacobiPreconditionerBA/1                 60348194 ns     60332167 ns           12
BM_BlockCRSJacobiPreconditionerBA/2                 35489954 ns     34782050 ns           20
BM_BlockCRSJacobiPreconditionerBA/4                 23636360 ns     22547032 ns           31
BM_BlockCRSJacobiPreconditionerBA/8                 31688798 ns     27857800 ns           25
BM_BlockCRSJacobiPreconditionerBA/16                30806695 ns     20562516 ns           31
BM_BlockSparseJacobiPreconditionerUnstructured/1    59793396 ns     59788583 ns           12
BM_BlockSparseJacobiPreconditionerUnstructured/2    35192900 ns     34968900 ns           20
BM_BlockSparseJacobiPreconditionerUnstructured/4    30171145 ns     28924480 ns           25
BM_BlockSparseJacobiPreconditionerUnstructured/8    24982583 ns     23193172 ns           29
BM_BlockSparseJacobiPreconditionerUnstructured/16   23370546 ns     18389694 ns           36
BM_BlockCRSJacobiPreconditionerUnstructured/1       63204538 ns     63204545 ns           11
BM_BlockCRSJacobiPreconditionerUnstructured/2       34466060 ns     34193429 ns           21
BM_BlockCRSJacobiPreconditionerUnstructured/4       22712230 ns     20491147 ns           34
BM_BlockCRSJacobiPreconditionerUnstructured/8       16701833 ns     16190395 ns           43
BM_BlockCRSJacobiPreconditionerUnstructured/16      16762565 ns     12857304 ns           56

Note that single-threaded performance regresses (most likely due to ParallelFor
overhead). Performance improves with 2 and 4 threads and then essentially plateaus.
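
For reference, a minimal standalone sketch of the locking pattern used by the
parallel loops in this change (simplified types and a hypothetical
ParallelAccumulate helper, not the actual Ceres ParallelFor API): row blocks
are distributed across threads, and each accumulation into a shared diagonal
block is serialized by that block's mutex.

  #include <mutex>
  #include <thread>
  #include <vector>

  // One accumulator per diagonal block of the preconditioner.
  struct DiagonalBlock {
    std::vector<double> values;
  };

  // Distribute row blocks over threads; only the accumulation into a shared
  // diagonal block is serialized, via that block's mutex.
  void ParallelAccumulate(int num_row_blocks,
                          int num_threads,
                          const std::vector<int>& row_to_col_block,
                          std::vector<DiagonalBlock>* blocks,
                          std::vector<std::mutex>* locks) {
    std::vector<std::thread> workers;
    for (int t = 0; t < num_threads; ++t) {
      workers.emplace_back([&, t] {
        // Static interleaved partition of the row blocks across threads.
        for (int i = t; i < num_row_blocks; i += num_threads) {
          const int b = row_to_col_block[i];
          std::lock_guard<std::mutex> lock((*locks)[b]);
          for (double& v : (*blocks)[b].values) {
            v += 1.0;  // Stand-in for the J_i^T J_i outer-product update.
          }
        }
      });
    }
    for (auto& w : workers) w.join();
  }

  int main() {
    std::vector<DiagonalBlock> blocks(
        3, DiagonalBlock{std::vector<double>(4, 0.0)});
    std::vector<std::mutex> locks(3);
    const std::vector<int> row_to_col_block = {0, 1, 2, 0, 1, 2, 0, 1};
    ParallelAccumulate(/* num_row_blocks = */ 8, /* num_threads = */ 4,
                       row_to_col_block, &blocks, &locks);
    return 0;
  }

In the patch below the same idea is expressed with ParallelFor, the per-cell
mutex in CellInfo, and the per-column locks_ vector.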

Change-Id: I96a5d2f719545e14c03d73e71c8c0564e8c1c729
diff --git a/internal/ceres/block_jacobi_preconditioner.cc b/internal/ceres/block_jacobi_preconditioner.cc
index b452e7d..b7ee002 100644
--- a/internal/ceres/block_jacobi_preconditioner.cc
+++ b/internal/ceres/block_jacobi_preconditioner.cc
@@ -39,12 +39,14 @@
 #include "ceres/block_structure.h"
 #include "ceres/casts.h"
 #include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
 #include "ceres/small_blas.h"
 
 namespace ceres::internal {
 
 BlockSparseJacobiPreconditioner::BlockSparseJacobiPreconditioner(
-    const BlockSparseMatrix& A) {
+    Preconditioner::Options options, const BlockSparseMatrix& A)
+    : options_(std::move(options)) {
   m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(
       A.block_structure()->cols);
 }
@@ -56,48 +58,63 @@
   const CompressedRowBlockStructure* bs = A.block_structure();
   const double* values = A.values();
   m_->SetZero();
-  for (int i = 0; i < bs->rows.size(); ++i) {
-    const int row_block_size = bs->rows[i].block.size;
-    const std::vector<Cell>& cells = bs->rows[i].cells;
-    for (const auto& cell : cells) {
-      const int block_id = cell.block_id;
-      const int col_block_size = bs->cols[block_id].size;
 
-      int r, c, row_stride, col_stride;
-      CellInfo* cell_info =
-          m_->GetCell(block_id, block_id, &r, &c, &row_stride, &col_stride);
-      MatrixRef m(cell_info->values, row_stride, col_stride);
-      ConstMatrixRef b(values + cell.position, row_block_size, col_block_size);
-      // clang-format off
-      MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic,
-          Eigen::Dynamic,Eigen::Dynamic, 1>(
-          values + cell.position, row_block_size,col_block_size,
-          values + cell.position, row_block_size,col_block_size,
-          cell_info->values,r, c,row_stride,col_stride);
-      // clang-format on
-    }
-  }
+  ParallelFor(options_.context,
+              0,
+              bs->rows.size(),
+              options_.num_threads,
+              [this, bs, values](int i) {
+                const int row_block_size = bs->rows[i].block.size;
+                const std::vector<Cell>& cells = bs->rows[i].cells;
+                for (const auto& cell : cells) {
+                  const int block_id = cell.block_id;
+                  const int col_block_size = bs->cols[block_id].size;
+                  int r, c, row_stride, col_stride;
+                  CellInfo* cell_info = m_->GetCell(
+                      block_id, block_id, &r, &c, &row_stride, &col_stride);
+                  MatrixRef m(cell_info->values, row_stride, col_stride);
+                  ConstMatrixRef b(
+                      values + cell.position, row_block_size, col_block_size);
+                  std::lock_guard<std::mutex> l(cell_info->m);
+                  // clang-format off
+                  MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic,
+                   Eigen::Dynamic,Eigen::Dynamic, 1>(
+                   values + cell.position, row_block_size,col_block_size,
+                   values + cell.position, row_block_size,col_block_size,
+                   cell_info->values,r, c,row_stride,col_stride);
+                  // clang-format on
+                }
+              });
 
   if (D != nullptr) {
     // Add the diagonal.
-    int position = 0;
-    for (int i = 0; i < bs->cols.size(); ++i) {
-      const int block_size = bs->cols[i].size;
-      int r, c, row_stride, col_stride;
-      CellInfo* cell_info = m_->GetCell(i, i, &r, &c, &row_stride, &col_stride);
-      MatrixRef m(cell_info->values, row_stride, col_stride);
-      m.block(r, c, block_size, block_size).diagonal() +=
-          ConstVectorRef(D + position, block_size).array().square().matrix();
-      position += block_size;
-    }
+    ParallelFor(options_.context,
+                0,
+                bs->cols.size(),
+                options_.num_threads,
+                [this, bs, D](int i) {
+                  const int block_size = bs->cols[i].size;
+                  int r, c, row_stride, col_stride;
+                  CellInfo* cell_info =
+                      m_->GetCell(i, i, &r, &c, &row_stride, &col_stride);
+                  MatrixRef m(cell_info->values, row_stride, col_stride);
+                  m.block(r, c, block_size, block_size).diagonal() +=
+                      ConstVectorRef(D + bs->cols[i].position, block_size)
+                          .array()
+                          .square()
+                          .matrix();
+                });
   }
 
+  // TODO(sameeragarwal): Once matrices are threaded, this call to invert should
+  // also be parallelized.
   m_->Invert();
   return true;
 }
 
 BlockCRSJacobiPreconditioner::BlockCRSJacobiPreconditioner(
-    const CompressedRowSparseMatrix& A) {
+    Preconditioner::Options options, const CompressedRowSparseMatrix& A)
+    : options_(std::move(options)), locks_(A.col_blocks().size()) {
   auto& col_blocks = A.col_blocks();
 
   // Compute the number of non-zeros in the preconditioner. This is needed so
@@ -126,6 +143,12 @@
     }
   }
 
+  // In reality we only need num_col_blocks locks, however that would require
+  // that in UpdateImpl we are able to look up the column block from its
+  // first column. To avoid maintaining that map, we will instead spend a few
+  // extra lock objects.
+  std::vector<std::mutex> locks(A.num_cols());
+  locks_.swap(locks);
   CHECK_EQ(m_rows[A.num_cols()], m_nnz);
 }
 
@@ -141,49 +164,60 @@
   const int* a_rows = A.rows();
   const int* a_cols = A.cols();
   const double* a_values = A.values();
-
-  m_->SetZero();
   double* m_values = m_->mutable_values();
   const int* m_rows = m_->rows();
 
-  for (int i = 0; i < num_row_blocks; ++i) {
-    const int row = row_blocks[i].position;
-    const int row_block_size = row_blocks[i].size;
-    const int row_nnz = a_rows[row + 1] - a_rows[row];
-    ConstMatrixRef row_block(a_values + a_rows[row], row_block_size, row_nnz);
-    int c = 0;
-    while (c < row_nnz) {
-      const int idx = a_rows[row] + c;
-      const int col = a_cols[idx];
-      const int col_block_size = m_rows[col + 1] - m_rows[col];
+  m_->SetZero();
 
-      // We make use of the fact that the entire diagonal block is stored
-      // contiguously in memory as a row-major matrix.
-      MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
+  ParallelFor(
+      options_.context,
+      0,
+      num_row_blocks,
+      options_.num_threads,
+      [this, row_blocks, a_rows, a_cols, a_values, m_values, m_rows](int i) {
+        const int row = row_blocks[i].position;
+        const int row_block_size = row_blocks[i].size;
+        const int row_nnz = a_rows[row + 1] - a_rows[row];
+        ConstMatrixRef row_block(
+            a_values + a_rows[row], row_block_size, row_nnz);
+        int c = 0;
+        while (c < row_nnz) {
+          const int idx = a_rows[row] + c;
+          const int col = a_cols[idx];
+          const int col_block_size = m_rows[col + 1] - m_rows[col];
 
-      // We do not have a row_stride version of MatrixTransposeMatrixMultiply,
-      // otherwise we could use it here to further speed up the following
-      // expression.
-      auto b = row_block.middleCols(c, col_block_size);
-      m.noalias() += b.transpose() * b;
-      c += col_block_size;
-    }
-  }
+          // We make use of the fact that the entire diagonal block is
+          // stored contiguously in memory as a row-major matrix.
+          MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
+          // We do not have a row_stride version of
+          // MatrixTransposeMatrixMultiply, otherwise we could use it
+          // here to further speed up the following expression.
+          auto b = row_block.middleCols(c, col_block_size);
+          std::lock_guard<std::mutex> l(locks_[col]);
+          m.noalias() += b.transpose() * b;
+          c += col_block_size;
+        }
+      });
 
-  for (int i = 0; i < num_col_blocks; ++i) {
-    const int col = col_blocks[i].position;
-    const int col_block_size = col_blocks[i].size;
-    MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
+  ParallelFor(
+      options_.context,
+      0,
+      num_col_blocks,
+      options_.num_threads,
+      [col_blocks, m_rows, m_values, D](int i) {
+        const int col = col_blocks[i].position;
+        const int col_block_size = col_blocks[i].size;
+        MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
 
-    if (D != nullptr) {
-      m.diagonal() +=
-          ConstVectorRef(D + col, col_block_size).array().square().matrix();
-    }
+        if (D != nullptr) {
+          m.diagonal() +=
+              ConstVectorRef(D + col, col_block_size).array().square().matrix();
+        }
 
-    // TODO(sameeragarwal): Deal with Cholesky inversion failure here and
-    // elsewhere.
-    m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size));
-  }
+        // TODO(sameeragarwal): Deal with Cholesky inversion failure here and
+        // elsewhere.
+        m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size));
+      });
 
   return true;
 }
diff --git a/internal/ceres/block_jacobi_preconditioner.h b/internal/ceres/block_jacobi_preconditioner.h
index 265b7d6..a39ad12 100644
--- a/internal/ceres/block_jacobi_preconditioner.h
+++ b/internal/ceres/block_jacobi_preconditioner.h
@@ -52,7 +52,8 @@
     : public BlockSparseMatrixPreconditioner {
  public:
   // A must remain valid while the BlockJacobiPreconditioner is.
-  explicit BlockSparseJacobiPreconditioner(const BlockSparseMatrix& A);
+  explicit BlockSparseJacobiPreconditioner(Preconditioner::Options options,
+                                           const BlockSparseMatrix& A);
   ~BlockSparseJacobiPreconditioner() override;
   void RightMultiplyAndAccumulate(const double* x, double* y) const final {
     return m_->RightMultiplyAndAccumulate(x, y);
@@ -64,6 +65,7 @@
  private:
   bool UpdateImpl(const BlockSparseMatrix& A, const double* D) final;
 
+  Preconditioner::Options options_;
   std::unique_ptr<BlockRandomAccessDiagonalMatrix> m_;
 };
 
@@ -73,7 +75,8 @@
     : public CompressedRowSparseMatrixPreconditioner {
  public:
   // A must remain valid while the BlockJacobiPreconditioner is.
-  explicit BlockCRSJacobiPreconditioner(const CompressedRowSparseMatrix& A);
+  explicit BlockCRSJacobiPreconditioner(Preconditioner::Options options,
+                                        const CompressedRowSparseMatrix& A);
   ~BlockCRSJacobiPreconditioner() override;
   void RightMultiplyAndAccumulate(const double* x, double* y) const final {
     m_->RightMultiplyAndAccumulate(x, y);
@@ -85,6 +88,8 @@
  private:
   bool UpdateImpl(const CompressedRowSparseMatrix& A, const double* D) final;
 
+  Preconditioner::Options options_;
+  std::vector<std::mutex> locks_;
   std::unique_ptr<CompressedRowSparseMatrix> m_;
 };
 
diff --git a/internal/ceres/block_jacobi_preconditioner_benchmark.cc b/internal/ceres/block_jacobi_preconditioner_benchmark.cc
index 1d8ad10..d2508c4 100644
--- a/internal/ceres/block_jacobi_preconditioner_benchmark.cc
+++ b/internal/ceres/block_jacobi_preconditioner_benchmark.cc
@@ -59,14 +59,26 @@
   std::mt19937 prng;
   auto jacobian = CreateFakeBundleAdjustmentJacobian(
       kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
-  BlockSparseJacobiPreconditioner p(*jacobian);
+
+  Preconditioner::Options preconditioner_options;
+  ContextImpl context;
+  preconditioner_options.context = &context;
+  preconditioner_options.num_threads = state.range(0);
+  context.EnsureMinimumThreads(preconditioner_options.num_threads);
+  BlockSparseJacobiPreconditioner p(preconditioner_options, *jacobian);
+
   Vector d = Vector::Ones(jacobian->num_cols());
   for (auto _ : state) {
     p.Update(*jacobian, d.data());
   }
 }
 
-BENCHMARK(BM_BlockSparseJacobiPreconditionerBA);
+BENCHMARK(BM_BlockSparseJacobiPreconditionerBA)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(16);
 
 static void BM_BlockCRSJacobiPreconditionerBA(benchmark::State& state) {
   std::mt19937 prng;
@@ -76,14 +88,25 @@
   CompressedRowSparseMatrix jacobian_crs(
       jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros());
   jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
-  BlockCRSJacobiPreconditioner p(jacobian_crs);
+  Preconditioner::Options preconditioner_options;
+  ContextImpl context;
+  preconditioner_options.context = &context;
+  preconditioner_options.num_threads = state.range(0);
+  context.EnsureMinimumThreads(preconditioner_options.num_threads);
+  BlockCRSJacobiPreconditioner p(preconditioner_options, jacobian_crs);
+
   Vector d = Vector::Ones(jacobian_crs.num_cols());
   for (auto _ : state) {
     p.Update(jacobian_crs, d.data());
   }
 }
 
-BENCHMARK(BM_BlockCRSJacobiPreconditionerBA);
+BENCHMARK(BM_BlockCRSJacobiPreconditionerBA)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(16);
 
 static void BM_BlockSparseJacobiPreconditionerUnstructured(
     benchmark::State& state) {
@@ -98,14 +121,25 @@
   std::mt19937 prng;
 
   auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
-  BlockSparseJacobiPreconditioner p(*jacobian);
+  Preconditioner::Options preconditioner_options;
+  ContextImpl context;
+  preconditioner_options.context = &context;
+  preconditioner_options.num_threads = state.range(0);
+  context.EnsureMinimumThreads(preconditioner_options.num_threads);
+  BlockSparseJacobiPreconditioner p(preconditioner_options, *jacobian);
+
   Vector d = Vector::Ones(jacobian->num_cols());
   for (auto _ : state) {
     p.Update(*jacobian, d.data());
   }
 }
 
-BENCHMARK(BM_BlockSparseJacobiPreconditionerUnstructured);
+BENCHMARK(BM_BlockSparseJacobiPreconditionerUnstructured)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(16);
 
 static void BM_BlockCRSJacobiPreconditionerUnstructured(
     benchmark::State& state) {
@@ -123,13 +157,24 @@
   CompressedRowSparseMatrix jacobian_crs(
       jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros());
   jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
-  BlockCRSJacobiPreconditioner p(jacobian_crs);
+  Preconditioner::Options preconditioner_options;
+  ContextImpl context;
+  preconditioner_options.context = &context;
+  preconditioner_options.num_threads = state.range(0);
+  context.EnsureMinimumThreads(preconditioner_options.num_threads);
+  BlockCRSJacobiPreconditioner p(preconditioner_options, jacobian_crs);
+
   Vector d = Vector::Ones(jacobian_crs.num_cols());
   for (auto _ : state) {
     p.Update(jacobian_crs, d.data());
   }
 }
-BENCHMARK(BM_BlockCRSJacobiPreconditionerUnstructured);
+BENCHMARK(BM_BlockCRSJacobiPreconditionerUnstructured)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(4)
+    ->Arg(8)
+    ->Arg(16);
 
 }  // namespace ceres::internal
 
diff --git a/internal/ceres/block_jacobi_preconditioner_test.cc b/internal/ceres/block_jacobi_preconditioner_test.cc
index fa051e4..eb3c9d0 100644
--- a/internal/ceres/block_jacobi_preconditioner_test.cc
+++ b/internal/ceres/block_jacobi_preconditioner_test.cc
@@ -55,6 +55,10 @@
   options.block_density = 0.25;
   std::mt19937 prng;
 
+  Preconditioner::Options preconditioner_options;
+  ContextImpl context;
+  preconditioner_options.context = &context;
+
   for (int trial = 0; trial < kNumtrials; ++trial) {
     auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
     Vector diagonal = Vector::Ones(jacobian->num_cols());
@@ -63,7 +67,7 @@
     Matrix hessian = dense_jacobian.transpose() * dense_jacobian;
     hessian.diagonal() += diagonal.array().square().matrix();
 
-    BlockSparseJacobiPreconditioner pre(*jacobian);
+    BlockSparseJacobiPreconditioner pre(preconditioner_options, *jacobian);
     pre.Update(*jacobian, diagonal.data());
 
     // The const_cast is needed to be able to call GetCell.
@@ -104,6 +108,10 @@
   options.block_density = 0.25;
   std::mt19937 prng;
 
+  Preconditioner::Options preconditioner_options;
+  ContextImpl context;
+  preconditioner_options.context = &context;
+
   for (int trial = 0; trial < kNumtrials; ++trial) {
     auto jacobian =
         CompressedRowSparseMatrix::CreateRandomMatrix(options, prng);
@@ -114,7 +122,7 @@
     Matrix hessian = dense_jacobian.transpose() * dense_jacobian;
     hessian.diagonal() += diagonal.array().square().matrix();
 
-    BlockCRSJacobiPreconditioner pre(*jacobian);
+    BlockCRSJacobiPreconditioner pre(preconditioner_options, *jacobian);
     pre.Update(*jacobian, diagonal.data());
     auto& m = pre.matrix();
 
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index bed4ad0..2215ab9 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -135,18 +135,20 @@
     double* x) {
   EventLogger event_logger("CgnrSolver::Solve");
   if (!preconditioner_) {
+    Preconditioner::Options preconditioner_options;
+    preconditioner_options.type = options_.preconditioner_type;
+    preconditioner_options.subset_preconditioner_start_row_block =
+        options_.subset_preconditioner_start_row_block;
+    preconditioner_options.sparse_linear_algebra_library_type =
+        options_.sparse_linear_algebra_library_type;
+    preconditioner_options.ordering_type = options_.ordering_type;
+    preconditioner_options.num_threads = options_.num_threads;
+    preconditioner_options.context = options_.context;
+
     if (options_.preconditioner_type == JACOBI) {
-      preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>(*A);
+      preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>(
+          preconditioner_options, *A);
     } else if (options_.preconditioner_type == SUBSET) {
-      Preconditioner::Options preconditioner_options;
-      preconditioner_options.type = SUBSET;
-      preconditioner_options.subset_preconditioner_start_row_block =
-          options_.subset_preconditioner_start_row_block;
-      preconditioner_options.sparse_linear_algebra_library_type =
-          options_.sparse_linear_algebra_library_type;
-      preconditioner_options.ordering_type = options_.ordering_type;
-      preconditioner_options.num_threads = options_.num_threads;
-      preconditioner_options.context = options_.context;
       preconditioner_ =
           std::make_unique<SubsetPreconditioner>(preconditioner_options, *A);
     } else {
@@ -238,9 +240,11 @@
 class CERES_NO_EXPORT CudaJacobiPreconditioner final
     : public CudaPreconditioner {
  public:
-  explicit CudaJacobiPreconditioner(ContextImpl* context,
+  explicit CudaJacobiPreconditioner(Preconditioner::Options options,
                                     const CompressedRowSparseMatrix& A)
-      : cpu_preconditioner_(A), m_(context, cpu_preconditioner_.matrix()) {}
+      : options_(std::move(options)),
+        cpu_preconditioner_(options_, A),
+        m_(options_.context, cpu_preconditioner_.matrix()) {}
   ~CudaJacobiPreconditioner() = default;
 
   void Update(const CompressedRowSparseMatrix& A, const double* D) final {
@@ -253,6 +257,7 @@
   }
 
  private:
+  Preconditioner::Options options_;
   BlockCRSJacobiPreconditioner cpu_preconditioner_;
   CudaSparseMatrix m_;
 };
@@ -297,9 +302,20 @@
     Atb_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
     Ax_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
     D_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
+
+    Preconditioner::Options preconditioner_options;
+    preconditioner_options.type = options_.preconditioner_type;
+    preconditioner_options.subset_preconditioner_start_row_block =
+        options_.subset_preconditioner_start_row_block;
+    preconditioner_options.sparse_linear_algebra_library_type =
+        options_.sparse_linear_algebra_library_type;
+    preconditioner_options.ordering_type = options_.ordering_type;
+    preconditioner_options.num_threads = options_.num_threads;
+    preconditioner_options.context = options_.context;
+
     if (options_.preconditioner_type == JACOBI) {
       preconditioner_ =
-          std::make_unique<CudaJacobiPreconditioner>(options_.context, A);
+          std::make_unique<CudaJacobiPreconditioner>(preconditioner_options, A);
     } else {
       preconditioner_ = std::make_unique<CudaIdentityPreconditioner>();
     }