Parallel operations on vectors

Main focus of this change is to parallelize remaining operations (most of them
are operations on vectors) in code-path utilized with iterative Schur
complement.

Parallelization is handled using lazy evaluation of Eigen expressions.

On linux pc with intel 8176 processor parallelization of vector operations has
the following effect:

Running ./bin/parallel_vector_operations_benchmark
Run on (112 X 3200.32 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x56)
  L1 Instruction 32 KiB (x56)
  L2 Unified 1024 KiB (x56)
  L3 Unified 39424 KiB (x2)
Load Average: 3.30, 8.41, 11.82
-----------------------------------
Benchmark                      Time
-----------------------------------
SetZero                 10009532 ns
SetZeroParallel/1       10024139 ns
...
SetZeroParallel/16        877606 ns

Negate                   4978856 ns
NegateParallel/1         5145413 ns
...
NegateParallel/16         721823 ns

Assign                  10731408 ns
AssignParallel/1        10749944 ns
...
AssignParallel/16        1829381 ns

D2X                     15214399 ns
D2XParallel/1           15623245 ns
...
D2XParallel/16           2687060 ns

DivideSqrt               8220050 ns
DivideSqrtParallel/1     9088467 ns
...
DivideSqrtParallel/16     905569 ns

Clamp                    3502010 ns
ClampParallel/1          4507897 ns
...
ClampParallel/16          759576 ns

Norm                     4426782 ns
NormParallel/1           4442805 ns
...
NormParallel/16           430290 ns

Dot                      9023276 ns
DotParallel/1            9031304 ns
...
DotParallel/16           1157267 ns

Axpby                   14608289 ns
AxpbyParallel/1         14570825 ns
...
AxpbyParallel/16         2672220 ns
-----------------------------------

Multi-threading of vector operations in ISC and program evaluation results into
the following improvement:

Running ./bin/evaluation_benchmark
--------------------------------------------------------------------------------------
Benchmark                                                               this   2fd81de
--------------------------------------------------------------------------------------
Residuals<problem-13682-4456117-pre.txt>/1                           4136 ms   4292 ms
Residuals<problem-13682-4456117-pre.txt>/2                           2919 ms   2670 ms
Residuals<problem-13682-4456117-pre.txt>/4                           2065 ms   2198 ms
Residuals<problem-13682-4456117-pre.txt>/8                           1458 ms   1609 ms
Residuals<problem-13682-4456117-pre.txt>/16                          1152 ms   1227 ms

ResidualsAndJacobian<problem-13682-4456117-pre.txt>/1               19759 ms  20084 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/2               10921 ms  10977 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/4                6220 ms   6941 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/8                3490 ms   4398 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/16               2277 ms   3172 ms

Plus<problem-13682-4456117-pre.txt>/1                                 339 ms    322 ms
Plus<problem-13682-4456117-pre.txt>/2                                 220 ms
Plus<problem-13682-4456117-pre.txt>/4                                 128 ms
Plus<problem-13682-4456117-pre.txt>/8                                78.0 ms
Plus<problem-13682-4456117-pre.txt>/16                               49.8 ms

ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/1       2434 ms   2478 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/2       2706 ms   2688 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/4       1430 ms   1548 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/8        742 ms    883 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/16       438 ms    555 ms

ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/1   2438 ms   2481 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/2   2565 ms   2790 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/4   1434 ms   1551 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/8    765 ms    892 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/16   435 ms    559 ms

JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/1           1278 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/2           1555 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/4            833 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/8            459 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/16           250 ms

JacobianScaleColumns<problem-13682-4456117-pre.txt>/1                1468 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/2                1871 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/4                 957 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/8                 528 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/16                294 ms

End-to-end improvements with bundle_adjuster invoked with
./bin/bundle_adjuster --num_threads 28 --num_iterations 40 \
                      --linear_solver iterative_schur \
                      --preconditioner jacobi --input
---------------------------------------------
Problem                         this  2fd81de
---------------------------------------------
problem-13682-4456117-pre.txt  508.6    892.7
problem-1778-993923-pre.txt    763.8   1129.9
problem-1723-156502-pre.txt      6.3     14.4
problem-356-226730-pre.txt      76.3    116.2
problem-257-65132-pre.txt       38.6     52.0

Change-Id: Ie31cc5015f13fa479c16ffb5ce48c9b880990d49
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 1e6a699..23dbbd6 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -589,6 +589,9 @@
   add_executable(spmv_benchmark spmv_benchmark.cc)
   add_dependencies_to_benchmark(spmv_benchmark)
 
+  add_executable(parallel_vector_operations_benchmark parallel_vector_operations_benchmark.cc)
+  add_dependencies_to_benchmark(parallel_vector_operations_benchmark)
+
   add_executable(block_jacobi_preconditioner_benchmark
     block_jacobi_preconditioner_benchmark.cc)
   add_dependencies_to_benchmark(block_jacobi_preconditioner_benchmark)
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index 459d709..c81ea39 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -94,6 +94,7 @@
   values_ = std::make_unique<double[]>(num_nonzeros_);
   max_num_nonzeros_ = num_nonzeros_;
   CHECK(values_ != nullptr);
+  AddTransposeBlockStructure();
 }
 
 void BlockSparseMatrix::AddTransposeBlockStructure() {
@@ -106,6 +107,10 @@
   std::fill(values_.get(), values_.get() + num_nonzeros_, 0.0);
 }
 
+void BlockSparseMatrix::SetZero(ContextImpl* context, int num_threads) {
+  ParallelSetZero(context, num_threads, values_.get(), num_nonzeros_);
+}
+
 void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
                                                    double* y) const {
   RightMultiplyAndAccumulate(x, y, nullptr, 1);
@@ -148,6 +153,7 @@
               });
 }
 
+// TODO: This method might benefit from caching column-block partition
 void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
                                                   double* y,
                                                   ContextImpl* context,
@@ -238,6 +244,40 @@
   }
 }
 
+// TODO: This method might benefit from caching column-block partition
+void BlockSparseMatrix::SquaredColumnNorm(double* x,
+                                          ContextImpl* context,
+                                          int num_threads) const {
+  if (transpose_block_structure_ == nullptr || num_threads == 1) {
+    SquaredColumnNorm(x);
+    return;
+  }
+
+  CHECK(x != nullptr);
+  ParallelSetZero(context, num_threads, x, num_cols_);
+
+  auto transpose_bs = transpose_block_structure_.get();
+  const auto values = values_.get();
+  const int num_col_blocks = transpose_bs->rows.size();
+  ParallelFor(
+      context,
+      0,
+      num_col_blocks,
+      num_threads,
+      [values, transpose_bs, x](int row_block_id) {
+        const auto& row = transpose_bs->rows[row_block_id];
+
+        for (auto& cell : row.cells) {
+          const auto& col = transpose_bs->cols[cell.block_id];
+          const MatrixRef m(values + cell.position, col.size, row.block.size);
+          VectorRef(x + row.block.position, row.block.size) +=
+              m.colwise().squaredNorm();
+        }
+      },
+      transpose_bs->rows.data(),
+      [](const CompressedRow& row) { return row.cumulative_nnz; });
+}
+
 void BlockSparseMatrix::ScaleColumns(const double* scale) {
   CHECK(scale != nullptr);
 
@@ -255,6 +295,38 @@
   }
 }
 
+// TODO: This method might benefit from caching column-block partition
+void BlockSparseMatrix::ScaleColumns(const double* scale,
+                                     ContextImpl* context,
+                                     int num_threads) {
+  if (transpose_block_structure_ == nullptr || num_threads == 1) {
+    ScaleColumns(scale);
+    return;
+  }
+
+  CHECK(scale != nullptr);
+  auto transpose_bs = transpose_block_structure_.get();
+  auto values = values_.get();
+  const int num_col_blocks = transpose_bs->rows.size();
+  ParallelFor(
+      context,
+      0,
+      num_col_blocks,
+      num_threads,
+      [values, transpose_bs, scale](int row_block_id) {
+        const auto& row = transpose_bs->rows[row_block_id];
+
+        for (auto& cell : row.cells) {
+          const auto& col = transpose_bs->cols[cell.block_id];
+          MatrixRef m(values + cell.position, col.size, row.block.size);
+          m *= ConstVectorRef(scale + row.block.position, row.block.size)
+                   .asDiagonal();
+        }
+      },
+      transpose_bs->rows.data(),
+      [](const CompressedRow& row) { return row.cumulative_nnz; });
+}
+
 void BlockSparseMatrix::ToCompressedRowSparseMatrix(
     CompressedRowSparseMatrix* crs_matrix) const {
   {
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h
index 72b1d68..ed3146f 100644
--- a/internal/ceres/block_sparse_matrix.h
+++ b/internal/ceres/block_sparse_matrix.h
@@ -71,7 +71,8 @@
   void operator=(const BlockSparseMatrix&) = delete;
 
   // Implementation of SparseMatrix interface.
-  void SetZero() final;
+  void SetZero() override final;
+  void SetZero(ContextImpl* context, int num_threads) override final;
   void RightMultiplyAndAccumulate(const double* x, double* y) const final;
   void RightMultiplyAndAccumulate(const double* x,
                                   double* y,
@@ -83,7 +84,13 @@
                                  ContextImpl* context,
                                  int num_threads) const final;
   void SquaredColumnNorm(double* x) const final;
+  void SquaredColumnNorm(double* x,
+                         ContextImpl* context,
+                         int num_threads) const final;
   void ScaleColumns(const double* scale) final;
+  void ScaleColumns(const double* scale,
+                    ContextImpl* context,
+                    int num_threads) final;
   void ToCompressedRowSparseMatrix(CompressedRowSparseMatrix* matrix) const;
   void ToDenseMatrix(Matrix* dense_matrix) const final;
   void ToTextFile(FILE* file) const final;
diff --git a/internal/ceres/block_sparse_matrix_test.cc b/internal/ceres/block_sparse_matrix_test.cc
index 6533a80..0529a2d 100644
--- a/internal/ceres/block_sparse_matrix_test.cc
+++ b/internal/ceres/block_sparse_matrix_test.cc
@@ -136,10 +136,23 @@
     CHECK_EQ(A_->num_cols(), B_->num_cols());
     CHECK_EQ(A_->num_nonzeros(), B_->num_nonzeros());
     context_.EnsureMinimumThreads(kNumThreads);
+
+    BlockSparseMatrix::RandomMatrixOptions options;
+    options.num_row_blocks = 1000;
+    options.min_row_block_size = 1;
+    options.max_row_block_size = 8;
+    options.num_col_blocks = 100;
+    options.min_col_block_size = 1;
+    options.max_col_block_size = 8;
+    options.block_density = 0.05;
+
+    std::mt19937 rng;
+    C_ = BlockSparseMatrix::CreateRandomMatrix(options, rng);
   }
 
   std::unique_ptr<BlockSparseMatrix> A_;
   std::unique_ptr<TripletSparseMatrix> B_;
+  std::unique_ptr<BlockSparseMatrix> C_;
   ContextImpl context_;
 };
 
@@ -187,14 +200,13 @@
 }
 
 TEST_F(BlockSparseMatrixTest, LeftMultiplyAndAccumulateParallelTest) {
-  Vector y_0 = Vector::Random(A_->num_rows());
+  Vector y_0 = Vector::Random(A_->num_cols());
   Vector y_s = y_0;
   Vector y_p = y_0;
 
-  Vector x = Vector::Random(A_->num_cols());
+  Vector x = Vector::Random(A_->num_rows());
   A_->LeftMultiplyAndAccumulate(x.data(), y_s.data());
 
-  A_->AddTransposeBlockStructure();
   A_->LeftMultiplyAndAccumulate(x.data(), y_p.data(), &context_, kNumThreads);
 
   // Parallel implementation for left products uses a different order of
@@ -210,6 +222,47 @@
   EXPECT_LT((y_a - y_b).norm(), 1e-12);
 }
 
+TEST_F(BlockSparseMatrixTest, SquaredColumnNormParallelTest) {
+  Vector y_a = Vector::Zero(C_->num_cols());
+  Vector y_b = Vector::Zero(C_->num_cols());
+  C_->SquaredColumnNorm(y_a.data());
+
+  C_->SquaredColumnNorm(y_b.data(), &context_, kNumThreads);
+  EXPECT_LT((y_a - y_b).norm(), 1e-12);
+}
+
+TEST_F(BlockSparseMatrixTest, ScaleColumnsTest) {
+  const Vector scale = Vector::Random(C_->num_cols()).cwiseAbs();
+
+  const Vector x = Vector::Random(C_->num_rows());
+  Vector y_expected = Vector::Zero(C_->num_cols());
+  C_->LeftMultiplyAndAccumulate(x.data(), y_expected.data());
+  y_expected.array() *= scale.array();
+
+  C_->ScaleColumns(scale.data());
+  Vector y_observed = Vector::Zero(C_->num_cols());
+  C_->LeftMultiplyAndAccumulate(x.data(), y_observed.data());
+
+  EXPECT_GT(y_expected.norm(), 1.);
+  EXPECT_LT((y_observed - y_expected).norm(), 1e-12 * y_expected.norm());
+}
+
+TEST_F(BlockSparseMatrixTest, ScaleColumnsParallelTest) {
+  const Vector scale = Vector::Random(C_->num_cols()).cwiseAbs();
+
+  const Vector x = Vector::Random(C_->num_rows());
+  Vector y_expected = Vector::Zero(C_->num_cols());
+  C_->LeftMultiplyAndAccumulate(x.data(), y_expected.data());
+  y_expected.array() *= scale.array();
+
+  C_->ScaleColumns(scale.data(), &context_, kNumThreads);
+  Vector y_observed = Vector::Zero(C_->num_cols());
+  C_->LeftMultiplyAndAccumulate(x.data(), y_observed.data());
+
+  EXPECT_GT(y_expected.norm(), 1.);
+  EXPECT_LT((y_observed - y_expected).norm(), 1e-12 * y_expected.norm());
+}
+
 TEST_F(BlockSparseMatrixTest, ToDenseMatrixTest) {
   Matrix m_a;
   Matrix m_b;
@@ -251,7 +304,6 @@
   std::unique_ptr<BlockSparseMatrix> m(
       down_cast<BlockSparseMatrix*>(problem->A.release()));
 
-  A_->AddTransposeBlockStructure();
   auto block_structure = A_->block_structure();
 
   // Several AppendRows and DeleteRowBlocks operations are applied to matrix,
@@ -486,7 +538,6 @@
     *ap_bs = *a->block_structure();
     BlockSparseMatrix ap(ap_bs.release());
     std::copy_n(a->values(), a->num_nonzeros(), ap.mutable_values());
-    ap.AddTransposeBlockStructure();
 
     Vector x = Vector::Random(a->num_cols());
     Vector y = Vector::Random(a->num_rows());
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index 524356a..a1a3c6e 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -104,7 +104,11 @@
     // y = y + DtDx
     if (D_ != nullptr) {
       int n = A_.num_cols();
-      y.array() += ConstVectorRef(D_, n).array().square() * x.array();
+      ParallelAssign(
+          context_,
+          num_threads_,
+          y,
+          y.array() + ConstVectorRef(D_, n).array().square() * x.array());
     }
   }
 
@@ -156,6 +160,8 @@
     preconditioner_options.context = options_.context;
 
     if (options_.preconditioner_type == JACOBI) {
+      // TODO: BlockSparseJacobiPreconditioner::RightMultiply will benefit from
+      // multithreading
       preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>(
           preconditioner_options, *A);
     } else if (options_.preconditioner_type == SUBSET) {
@@ -165,9 +171,6 @@
       preconditioner_ = std::make_unique<IdentityPreconditioner>(A->num_cols());
     }
   }
-  if (options_.num_threads > 1) {
-    A->AddTransposeBlockStructure();
-  }
   preconditioner_->Update(*A, per_solve_options.D);
 
   ConjugateGradientsSolverOptions cg_options;
@@ -176,6 +179,8 @@
   cg_options.residual_reset_period = options_.residual_reset_period;
   cg_options.q_tolerance = per_solve_options.q_tolerance;
   cg_options.r_tolerance = per_solve_options.r_tolerance;
+  cg_options.context = options_.context;
+  cg_options.num_threads = options_.num_threads;
 
   // lhs = AtA + DtD
   CgnrLinearOperator lhs(
diff --git a/internal/ceres/conjugate_gradients_solver.h b/internal/ceres/conjugate_gradients_solver.h
index 0c3581b..7da63c0 100644
--- a/internal/ceres/conjugate_gradients_solver.h
+++ b/internal/ceres/conjugate_gradients_solver.h
@@ -82,6 +82,8 @@
   int residual_reset_period = 10;
   double r_tolerance = 0.0;
   double q_tolerance = 0.0;
+  ContextImpl* context = nullptr;
+  int num_threads = 1;
 };
 
 // This function implements the now classical Conjugate Gradients algorithm of
@@ -125,9 +127,9 @@
   summary.message = "Maximum number of iterations reached.";
   summary.num_iterations = 0;
 
-  const double norm_rhs = Norm(rhs);
+  const double norm_rhs = Norm(rhs, options.context, options.num_threads);
   if (norm_rhs == 0.0) {
-    SetZero(solution);
+    SetZero(solution, options.context, options.num_threads);
     summary.termination_type = LinearSolverTerminationType::SUCCESS;
     summary.message = "Convergence. |b| = 0.";
     return summary;
@@ -135,13 +137,13 @@
 
   const double tol_r = options.r_tolerance * norm_rhs;
 
-  SetZero(tmp);
+  SetZero(tmp, options.context, options.num_threads);
   lhs.RightMultiplyAndAccumulate(solution, tmp);
 
   // r = rhs - tmp
-  Axpby(1.0, rhs, -1.0, tmp, r);
+  Axpby(1.0, rhs, -1.0, tmp, r, options.context, options.num_threads);
 
-  double norm_r = Norm(r);
+  double norm_r = Norm(r, options.context, options.num_threads);
   if (options.min_num_iterations == 0 && norm_r <= tol_r) {
     summary.termination_type = LinearSolverTerminationType::SUCCESS;
     summary.message =
@@ -153,16 +155,16 @@
 
   // Initial value of the quadratic model Q = x'Ax - 2 * b'x.
   // double Q0 = -1.0 * solution.dot(rhs + r);
-  Axpby(1.0, rhs, 1.0, r, tmp);
-  double Q0 = -Dot(solution, tmp);
+  Axpby(1.0, rhs, 1.0, r, tmp, options.context, options.num_threads);
+  double Q0 = -Dot(solution, tmp, options.context, options.num_threads);
 
   for (summary.num_iterations = 1;; ++summary.num_iterations) {
-    SetZero(z);
+    SetZero(z, options.context, options.num_threads);
     preconditioner.RightMultiplyAndAccumulate(r, z);
 
     const double last_rho = rho;
     // rho = r.dot(z);
-    rho = Dot(r, z);
+    rho = Dot(r, z, options.context, options.num_threads);
     if (IsZeroOrInfinity(rho)) {
       summary.termination_type = LinearSolverTerminationType::FAILURE;
       summary.message = StringPrintf("Numerical failure. rho = r'z = %e.", rho);
@@ -170,7 +172,7 @@
     }
 
     if (summary.num_iterations == 1) {
-      Copy(z, p);
+      Copy(z, p, options.context, options.num_threads);
     } else {
       const double beta = rho / last_rho;
       if (IsZeroOrInfinity(beta)) {
@@ -184,21 +186,21 @@
         break;
       }
       // p = z + beta * p;
-      Axpby(1.0, z, beta, p, p);
+      Axpby(1.0, z, beta, p, p, options.context, options.num_threads);
     }
 
     DenseVectorType& q = z;
-    SetZero(q);
+    SetZero(q, options.context, options.num_threads);
     lhs.RightMultiplyAndAccumulate(p, q);
-    const double pq = Dot(p, q);
+    const double pq = Dot(p, q, options.context, options.num_threads);
     if ((pq <= 0) || std::isinf(pq)) {
       summary.termination_type = LinearSolverTerminationType::NO_CONVERGENCE;
       summary.message = StringPrintf(
           "Matrix is indefinite, no more progress can be made. "
           "p'q = %e. |p| = %e, |q| = %e",
           pq,
-          Norm(p),
-          Norm(q));
+          Norm(p, options.context, options.num_threads),
+          Norm(q, options.context, options.num_threads));
       break;
     }
 
@@ -214,7 +216,13 @@
     }
 
     // solution = solution + alpha * p;
-    Axpby(1.0, solution, alpha, p, solution);
+    Axpby(1.0,
+          solution,
+          alpha,
+          p,
+          solution,
+          options.context,
+          options.num_threads);
 
     // Ideally we would just use the update r = r - alpha*q to keep
     // track of the residual vector. However this estimate tends to
@@ -224,20 +232,20 @@
     // requires an additional matrix vector multiply which would
     // double the complexity of the CG algorithm.
     if (summary.num_iterations % options.residual_reset_period == 0) {
-      SetZero(tmp);
+      SetZero(tmp, options.context, options.num_threads);
       lhs.RightMultiplyAndAccumulate(solution, tmp);
-      Axpby(1.0, rhs, -1.0, tmp, r);
+      Axpby(1.0, rhs, -1.0, tmp, r, options.context, options.num_threads);
       // r = rhs - tmp;
     } else {
-      Axpby(1.0, r, -alpha, q, r);
+      Axpby(1.0, r, -alpha, q, r, options.context, options.num_threads);
       // r = r - alpha * q;
     }
 
     // Quadratic model based termination.
     //   Q1 = x'Ax - 2 * b' x.
     // const double Q1 = -1.0 * solution.dot(rhs + r);
-    Axpby(1.0, rhs, 1.0, r, tmp);
-    const double Q1 = -Dot(solution, tmp);
+    Axpby(1.0, rhs, 1.0, r, tmp, options.context, options.num_threads);
+    const double Q1 = -Dot(solution, tmp, options.context, options.num_threads);
 
     // For PSD matrices A, let
     //
@@ -270,13 +278,13 @@
                        summary.num_iterations,
                        zeta,
                        options.q_tolerance,
-                       Norm(r));
+                       Norm(r, options.context, options.num_threads));
       break;
     }
     Q0 = Q1;
 
     // Residual based termination.
-    norm_r = Norm(r);
+    norm_r = Norm(r, options.context, options.num_threads);
     if (norm_r <= tol_r &&
         summary.num_iterations >= options.min_num_iterations) {
       summary.termination_type = LinearSolverTerminationType::SUCCESS;
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h
index 985eac0..34f3947 100644
--- a/internal/ceres/cuda_vector.h
+++ b/internal/ceres/cuda_vector.h
@@ -121,13 +121,31 @@
 // Blas1 operations on Cuda vectors. These functions are needed as an
 // abstraction layer so that we can use different versions of a vector style
 // object in the conjugate gradients linear solver.
-inline double Norm(const CudaVector& x) { return x.Norm(); }
-inline void SetZero(CudaVector& x) { x.SetZero(); }
+// Context and num_threads arguments are not used by CUDA implementation,
+// context embedded into CudaVector is used instead.
+inline double Norm(const CudaVector& x,
+                   ContextImpl* context = nullptr,
+                   int num_threads = 1) {
+  (void)context;
+  (void)num_threads;
+  return x.Norm();
+}
+inline void SetZero(CudaVector& x,
+                    ContextImpl* context = nullptr,
+                    int num_threads = 1) {
+  (void)context;
+  (void)num_threads;
+  x.SetZero();
+}
 inline void Axpby(double a,
                   const CudaVector& x,
                   double b,
                   const CudaVector& y,
-                  CudaVector& z) {
+                  CudaVector& z,
+                  ContextImpl* context = nullptr,
+                  int num_threads = 1) {
+  (void)context;
+  (void)num_threads;
   if (&x == &y && &y == &z) {
     // z = (a + b) * z;
     z.Scale(a + b);
@@ -146,8 +164,22 @@
     z.Axpby(a, x, b);
   }
 }
-inline double Dot(const CudaVector& x, const CudaVector& y) { return x.Dot(y); }
-inline void Copy(const CudaVector& from, CudaVector& to) { to = from; }
+inline double Dot(const CudaVector& x,
+                  const CudaVector& y,
+                  ContextImpl* context = nullptr,
+                  int num_threads = 1) {
+  (void)context;
+  (void)num_threads;
+  return x.Dot(y);
+}
+inline void Copy(const CudaVector& from,
+                 CudaVector& to,
+                 ContextImpl* context = nullptr,
+                 int num_threads = 1) {
+  (void)context;
+  (void)num_threads;
+  to = from;
+}
 
 }  // namespace ceres::internal
 
diff --git a/internal/ceres/eigen_vector_ops.h b/internal/ceres/eigen_vector_ops.h
index 5bcf49d..91f2946 100644
--- a/internal/ceres/eigen_vector_ops.h
+++ b/internal/ceres/eigen_vector_ops.h
@@ -31,21 +31,65 @@
 #ifndef CERES_INTERNAL_EIGEN_VECTOR_OPS_H_
 #define CERES_INTERNAL_EIGEN_VECTOR_OPS_H_
 
+#include <numeric>
+
 #include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
 
 namespace ceres::internal {
 
 // Blas1 operations on Eigen vectors. These functions are needed as an
 // abstraction layer so that we can use different versions of a vector style
 // object in the conjugate gradients linear solver.
-inline double Norm(const Vector& x) { return x.norm(); }
-inline void SetZero(Vector& x) { x.setZero(); }
-inline void Axpby(
-    double a, const Vector& x, double b, const Vector& y, Vector& z) {
-  z = a * x + b * y;
+inline double Norm(const Vector& x, ContextImpl* context, int num_threads) {
+  std::vector<double> norms(num_threads);
+  ParallelFor(context,
+              0,
+              x.rows(),
+              num_threads,
+              [&x, &norms](int thread_id, std::tuple<int, int> range) {
+                auto [start, end] = range;
+                norms[thread_id] += x.segment(start, end - start).squaredNorm();
+              });
+  return std::sqrt(std::accumulate(norms.begin(), norms.end(), 0.));
 }
-inline double Dot(const Vector& x, const Vector& y) { return x.dot(y); }
-inline void Copy(const Vector& from, Vector& to) { to = from; }
+inline void SetZero(Vector& x, ContextImpl* context, int num_threads) {
+  ParallelSetZero(context, num_threads, x);
+}
+inline void Axpby(double a,
+                  const Vector& x,
+                  double b,
+                  const Vector& y,
+                  Vector& z,
+                  ContextImpl* context,
+                  int num_threads) {
+  ParallelAssign(context, num_threads, z, a * x + b * y);
+}
+template <typename VectorLikeX, typename VectorLikeY>
+inline double Dot(const VectorLikeX& x,
+                  const VectorLikeY& y,
+                  ContextImpl* context,
+                  int num_threads) {
+  std::vector<double> dots(num_threads);
+  ParallelFor(context,
+              0,
+              x.rows(),
+              num_threads,
+              [&x, &y, &dots](int thread_id, std::tuple<int, int> range) {
+                auto [start, end] = range;
+                const int block_size = end - start;
+                const auto& x_block = x.segment(start, block_size);
+                const auto& y_block = y.segment(start, block_size);
+                dots[thread_id] += x_block.dot(y_block);
+              });
+  return std::accumulate(dots.begin(), dots.end(), 0.);
+}
+inline void Copy(const Vector& from,
+                 Vector& to,
+                 ContextImpl* context,
+                 int num_threads) {
+  ParallelAssign(context, num_threads, to, from);
+}
 
 }  // namespace ceres::internal
 
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 6f0591e..be784c2 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -39,6 +39,7 @@
 #include "ceres/cuda_sparse_matrix.h"
 #include "ceres/cuda_vector.h"
 #include "ceres/evaluator.h"
+#include "ceres/implicit_schur_complement.h"
 #include "ceres/partitioned_matrix_view.h"
 #include "ceres/preprocessor.h"
 #include "ceres/problem.h"
@@ -76,6 +77,21 @@
 
     parameters.resize(program->NumParameters());
     program->ParameterBlocksToStateVector(parameters.data());
+
+    const int num_residuals = program->NumResiduals();
+    b.resize(num_residuals);
+
+    std::mt19937 rng;
+    std::normal_distribution<double> rnorm;
+    for (int i = 0; i < num_residuals; ++i) {
+      b[i] = rnorm(rng);
+    }
+
+    const int num_parameters = program->NumParameters();
+    D.resize(num_parameters);
+    for (int i = 0; i < num_parameters; ++i) {
+      D[i] = rnorm(rng);
+    }
   }
 
   std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
@@ -128,15 +144,6 @@
     return block_sparse_jacobian.get();
   }
 
-  const BlockSparseMatrix* BlockSparseJacobianWithTranspose(
-      ContextImpl* context) {
-    if (!block_sparse_jacobian_with_transpose) {
-      block_sparse_jacobian_with_transpose = CreateBlockSparseJacobian(context);
-      block_sparse_jacobian_with_transpose->AddTransposeBlockStructure();
-    }
-    return block_sparse_jacobian_with_transpose.get();
-  }
-
   const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
       ContextImpl* context) {
     if (!crs_jacobian) {
@@ -147,14 +154,13 @@
 
   std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian(
       const LinearSolver::Options& options) {
-    auto block_sparse = BlockSparseJacobianWithTranspose(options.context);
+    auto block_sparse = BlockSparseJacobian(options.context);
     return std::make_unique<PartitionedView>(options, *block_sparse);
   }
 
   BlockSparseMatrix* BlockDiagonalEtE(const LinearSolver::Options& options) {
     if (!block_diagonal_ete) {
-      auto partitioned_view =
-          PartitionedMatrixViewJacobian(options);
+      auto partitioned_view = PartitionedMatrixViewJacobian(options);
       block_diagonal_ete = partitioned_view->CreateBlockDiagonalEtE();
     }
     return block_diagonal_ete.get();
@@ -162,21 +168,41 @@
 
   BlockSparseMatrix* BlockDiagonalFtF(const LinearSolver::Options& options) {
     if (!block_diagonal_ftf) {
-      auto partitioned_view =
-          PartitionedMatrixViewJacobian(options);
+      auto partitioned_view = PartitionedMatrixViewJacobian(options);
       block_diagonal_ftf = partitioned_view->CreateBlockDiagonalFtF();
     }
     return block_diagonal_ftf.get();
   }
 
+  const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal(
+      const LinearSolver::Options& options) {
+    auto block_sparse = BlockSparseJacobian(options.context);
+    implicit_schur_complement =
+        std::make_unique<ImplicitSchurComplement>(options);
+    implicit_schur_complement->Init(*block_sparse, nullptr, b.data());
+    return implicit_schur_complement.get();
+  }
+
+  const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal(
+      const LinearSolver::Options& options) {
+    auto block_sparse = BlockSparseJacobian(options.context);
+    implicit_schur_complement_diag =
+        std::make_unique<ImplicitSchurComplement>(options);
+    implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data());
+    return implicit_schur_complement_diag.get();
+  }
+
   Vector parameters;
+  Vector D;
+  Vector b;
   std::unique_ptr<BundleAdjustmentProblem> bal_problem;
   std::unique_ptr<PreprocessedProblem> preprocessed_problem;
   std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
-  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_with_transpose;
   std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
   std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;
   std::unique_ptr<BlockSparseMatrix> block_diagonal_ftf;
+  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement;
+  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement_diag;
 };
 
 static void Residuals(benchmark::State& state,
@@ -244,6 +270,32 @@
   }
 }
 
+static void Plus(benchmark::State& state, BALData* data, ContextImpl* context) {
+  const int num_threads = state.range(0);
+
+  Evaluator::Options options;
+  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
+  options.num_threads = num_threads;
+  options.context = context;
+  options.num_eliminate_blocks = 0;
+
+  std::string error;
+  CHECK(data->preprocessed_problem != nullptr);
+  auto program = data->preprocessed_problem->reduced_program.get();
+  CHECK(program != nullptr);
+  auto evaluator = Evaluator::Create(options, program, &error);
+  CHECK(evaluator != nullptr);
+
+  Vector state_plus_delta = Vector::Zero(program->NumParameters());
+  Vector delta = Vector::Random(program->NumEffectiveParameters());
+
+  for (auto _ : state) {
+    CHECK(evaluator->Plus(
+        data->parameters.data(), delta.data(), state_plus_delta.data()));
+  }
+  CHECK_GT(state_plus_delta.squaredNorm(), 0.);
+}
+
 static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
                                            BALData* data,
                                            ContextImpl* context) {
@@ -346,6 +398,71 @@
   }
 }
 
+static void ISCRightMultiplyNoDiag(benchmark::State& state,
+                                   BALData* data,
+                                   ContextImpl* context) {
+  LinearSolver::Options options;
+  options.num_threads = state.range(0);
+  options.elimination_groups.push_back(data->bal_problem->num_points());
+  options.context = context;
+  auto jacobian = data->ImplicitSchurComplementWithoutDiagonal(options);
+
+  Vector y = Vector::Zero(jacobian->num_rows());
+  Vector x = Vector::Random(jacobian->num_cols());
+  for (auto _ : state) {
+    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void ISCRightMultiplyDiag(benchmark::State& state,
+                                 BALData* data,
+                                 ContextImpl* context) {
+  LinearSolver::Options options;
+  options.num_threads = state.range(0);
+  options.elimination_groups.push_back(data->bal_problem->num_points());
+  options.context = context;
+
+  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
+
+  Vector y = Vector::Zero(jacobian->num_rows());
+  Vector x = Vector::Random(jacobian->num_cols());
+  for (auto _ : state) {
+    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void JacobianSquaredColumnNorm(benchmark::State& state,
+                                      BALData* data,
+                                      ContextImpl* context) {
+  const int num_threads = state.range(0);
+
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  Vector x = Vector::Zero(jacobian->num_cols());
+
+  for (auto _ : state) {
+    jacobian->SquaredColumnNorm(x.data(), context, num_threads);
+  }
+  CHECK_GT(x.squaredNorm(), 0.);
+}
+
+static void JacobianScaleColumns(benchmark::State& state,
+                                 BALData* data,
+                                 ContextImpl* context) {
+  const int num_threads = state.range(0);
+
+  auto jacobian_const = data->BlockSparseJacobian(context);
+  auto jacobian = const_cast<BlockSparseMatrix*>(jacobian_const);
+
+  Vector x = Vector::Ones(jacobian->num_cols());
+
+  for (auto _ : state) {
+    jacobian->ScaleColumns(x.data(), context, num_threads);
+  }
+}
+
 static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
                                                BALData* data,
                                                ContextImpl* context) {
@@ -368,7 +485,7 @@
                                               ContextImpl* context) {
   const int num_threads = state.range(0);
 
-  auto jacobian = data->BlockSparseJacobianWithTranspose(context);
+  auto jacobian = data->BlockSparseJacobian(context);
 
   Vector y = Vector::Zero(jacobian->num_cols());
   Vector x = Vector::Random(jacobian->num_rows());
@@ -483,6 +600,15 @@
         ->Arg(8)
         ->Arg(16);
 
+    const std::string name_plus = "Plus<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_plus.c_str(), ceres::internal::Plus, data, &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
+
     const std::string name_right_product =
         "JacobianRightMultiplyAndAccumulate<" + path + ">";
     ::benchmark::RegisterBenchmark(
@@ -534,6 +660,18 @@
         ->Arg(8)
         ->Arg(16);
 
+    const std::string name_isc_no_diag =
+        "ISCRightMultiplyAndAccumulate<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
+                                   ceres::internal::ISCRightMultiplyNoDiag,
+                                   data,
+                                   &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
+
     const std::string name_update_block_diagonal_ete =
         "PMVUpdateBlockDiagonalEtE<" + path + ">";
     ::benchmark::RegisterBenchmark(name_update_block_diagonal_ete.c_str(),
@@ -545,6 +683,17 @@
         ->Arg(4)
         ->Arg(8)
         ->Arg(16);
+    const std::string name_isc_diag =
+        "ISCRightMultiplyAndAccumulateDiag<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_isc_diag.c_str(),
+                                   ceres::internal::ISCRightMultiplyDiag,
+                                   data,
+                                   &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
 
 #ifndef CERES_NO_CUDA
     const std::string name_right_product_cuda =
@@ -606,6 +755,29 @@
         &context)
         ->Arg(1);
 #endif
+
+    const std::string name_squared_column_norm =
+        "JacobianSquaredColumnNorm<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_squared_column_norm.c_str(),
+                                   ceres::internal::JacobianSquaredColumnNorm,
+                                   data,
+                                   &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
+
+    const std::string name_scale_columns = "JacobianScaleColumns<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_scale_columns.c_str(),
+                                   ceres::internal::JacobianScaleColumns,
+                                   data,
+                                   &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
   }
   ::benchmark::RunSpecifiedBenchmarks();
 
diff --git a/internal/ceres/implicit_schur_complement.cc b/internal/ceres/implicit_schur_complement.cc
index 4e36048..dbba3a7 100644
--- a/internal/ceres/implicit_schur_complement.cc
+++ b/internal/ceres/implicit_schur_complement.cc
@@ -35,6 +35,7 @@
 #include "ceres/block_structure.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/linear_solver.h"
+#include "ceres/parallel_for.h"
 #include "ceres/types.h"
 #include "glog/logging.h"
 
@@ -104,21 +105,22 @@
 void ImplicitSchurComplement::RightMultiplyAndAccumulate(const double* x,
                                                          double* y) const {
   // y1 = F x
-  tmp_rows_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
   A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
 
   // y2 = E' y1
-  tmp_e_cols_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
   A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
 
   // y3 = -(E'E)^-1 y2
-  tmp_e_cols_2_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_2_);
   block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
                                                           tmp_e_cols_2_.data(),
                                                           options_.context,
                                                           options_.num_threads);
 
-  tmp_e_cols_2_ *= -1.0;
+  ParallelAssign(
+      options_.context, options_.num_threads, tmp_e_cols_2_, -tmp_e_cols_2_);
 
   // y1 = y1 + E y3
   A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
@@ -126,11 +128,14 @@
   // y5 = D * x
   if (D_ != nullptr) {
     ConstVectorRef Dref(D_ + A_->num_cols_e(), num_cols());
-    VectorRef(y, num_cols()) =
-        (Dref.array().square() * ConstVectorRef(x, num_cols()).array())
-            .matrix();
+    VectorRef y_cols(y, num_cols());
+    ParallelAssign(
+        options_.context,
+        options_.num_threads,
+        y_cols,
+        (Dref.array().square() * ConstVectorRef(x, num_cols()).array()));
   } else {
-    VectorRef(y, num_cols()).setZero();
+    ParallelSetZero(options_.context, options_.num_threads, y, num_cols());
   }
 
   // y = y5 + F' y1
@@ -141,25 +146,25 @@
     const double* x, double* y) const {
   CHECK(compute_ftf_inverse_);
   // y1 = F x
-  tmp_rows_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
   A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
 
   // y2 = E' y1
-  tmp_e_cols_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
   A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
 
   // y3 = (E'E)^-1 y2
-  tmp_e_cols_2_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_2_);
   block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
                                                           tmp_e_cols_2_.data(),
                                                           options_.context,
                                                           options_.num_threads);
   // y1 = E y3
-  tmp_rows_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
   A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
 
   // y4 = F' y1
-  tmp_f_cols_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_f_cols_);
   A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), tmp_f_cols_.data());
 
   // y += (F'F)^-1 y4
@@ -174,22 +179,27 @@
     const double* D, BlockSparseMatrix* block_diagonal) {
   const CompressedRowBlockStructure* block_diagonal_structure =
       block_diagonal->block_structure();
-  for (const auto& row : block_diagonal_structure->rows) {
-    const int row_block_pos = row.block.position;
-    const int row_block_size = row.block.size;
-    const Cell& cell = row.cells[0];
-    MatrixRef m(block_diagonal->mutable_values() + cell.position,
-                row_block_size,
-                row_block_size);
+  ParallelFor(options_.context,
+              0,
+              block_diagonal_structure->rows.size(),
+              options_.num_threads,
+              [block_diagonal_structure, D, block_diagonal](int row_block_id) {
+                auto& row = block_diagonal_structure->rows[row_block_id];
+                const int row_block_pos = row.block.position;
+                const int row_block_size = row.block.size;
+                const Cell& cell = row.cells[0];
+                MatrixRef m(block_diagonal->mutable_values() + cell.position,
+                            row_block_size,
+                            row_block_size);
 
-    if (D != nullptr) {
-      ConstVectorRef d(D + row_block_pos, row_block_size);
-      m += d.array().square().matrix().asDiagonal();
-    }
+                if (D != nullptr) {
+                  ConstVectorRef d(D + row_block_pos, row_block_size);
+                  m += d.array().square().matrix().asDiagonal();
+                }
 
-    m = m.selfadjointView<Eigen::Upper>().llt().solve(
-        Matrix::Identity(row_block_size, row_block_size));
-  }
+                m = m.selfadjointView<Eigen::Upper>().llt().solve(
+                    Matrix::Identity(row_block_size, row_block_size));
+              });
 }
 
 // Similar to RightMultiplyAndAccumulate, use the block structure of the matrix
@@ -201,18 +211,21 @@
   const int num_rows = A_->num_rows();
 
   // y1 = F x
-  tmp_rows_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
   A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
 
   // y2 = b - y1
-  tmp_rows_ = ConstVectorRef(b_, num_rows) - tmp_rows_;
+  ParallelAssign(options_.context,
+                 options_.num_threads,
+                 tmp_rows_,
+                 ConstVectorRef(b_, num_rows) - tmp_rows_);
 
   // y3 = E' y2
-  tmp_e_cols_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
   A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
 
   // y = (E'E)^-1 y3
-  VectorRef(y, num_cols).setZero();
+  ParallelSetZero(options_.context, options_.num_threads, y, num_cols);
   block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(
       tmp_e_cols_.data(), y, options_.context, options_.num_threads);
 
@@ -221,7 +234,11 @@
   // computed via back substitution. The second block of variables
   // corresponds to the Schur complement system, so we just copy those
   // values from the solution to the Schur complement.
-  VectorRef(y + num_cols_e, num_cols_f) = ConstVectorRef(x, num_cols_f);
+  VectorRef y_cols_f(y + num_cols_e, num_cols_f);
+  ParallelAssign(options_.context,
+                 options_.num_threads,
+                 y_cols_f,
+                 ConstVectorRef(x, num_cols_f));
 }
 
 // Compute the RHS of the Schur complement system.
@@ -232,23 +249,28 @@
 // this using a series of matrix vector products.
 void ImplicitSchurComplement::UpdateRhs() {
   // y1 = E'b
-  tmp_e_cols_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
   A_->LeftMultiplyAndAccumulateE(b_, tmp_e_cols_.data());
 
   // y2 = (E'E)^-1 y1
-  Vector y2 = Vector::Zero(A_->num_cols_e());
-  block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(
-      tmp_e_cols_.data(), y2.data(), options_.context, options_.num_threads);
+  ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_2_);
+  block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
+                                                          tmp_e_cols_2_.data(),
+                                                          options_.context,
+                                                          options_.num_threads);
 
   // y3 = E y2
-  tmp_rows_.setZero();
-  A_->RightMultiplyAndAccumulateE(y2.data(), tmp_rows_.data());
+  ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
+  A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
 
   // y3 = b - y3
-  tmp_rows_ = ConstVectorRef(b_, A_->num_rows()) - tmp_rows_;
+  ParallelAssign(options_.context,
+                 options_.num_threads,
+                 tmp_rows_,
+                 ConstVectorRef(b_, A_->num_rows()) - tmp_rows_);
 
   // rhs = F' y3
-  rhs_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, rhs_);
   A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), rhs_.data());
 }
 
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc
index 589a1d2..16b20a7 100644
--- a/internal/ceres/iterative_schur_complement_solver.cc
+++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -130,7 +130,7 @@
 
   Vector scratch[4];
   for (int i = 0; i < 4; ++i) {
-    scratch[i] = Vector::Zero(schur_complement_->num_cols());
+    scratch[i].resize(schur_complement_->num_cols());
   }
   Vector* scratch_ptr[4] = {&scratch[0], &scratch[1], &scratch[2], &scratch[3]};
 
@@ -180,15 +180,23 @@
       break;
     case JACOBI:
       preconditioner_ = std::make_unique<SparseMatrixPreconditionerWrapper>(
-          schur_complement_->block_diagonal_FtF_inverse());
+          schur_complement_->block_diagonal_FtF_inverse(),
+          preconditioner_options);
       break;
     case SCHUR_POWER_SERIES_EXPANSION:
       // Ignoring the value of spse_tolerance to ensure preconditioner stays
       // fixed during the iterations of cg.
+      // TODO: In PowerSeriesExpansionPreconditioner::RightMultiplyAndAccumulate
+      // only operations performed via ImplicitSchurComplement are threaded.
+      // PowerSeriesExpansionPreconditioner will benefit from multithreading
+      // applied to remaning operations (block-sparse right product and several
+      // vector operations)
       preconditioner_ = std::make_unique<PowerSeriesExpansionPreconditioner>(
           schur_complement_.get(), options_.max_num_spse_iterations, 0);
       break;
     case SCHUR_JACOBI:
+      // TODO: SchurJacobiPreconditioner::RightMultiply will benefit from
+      // multithreading
       preconditioner_ = std::make_unique<SchurJacobiPreconditioner>(
           *A->block_structure(), preconditioner_options);
       break;
diff --git a/internal/ceres/levenberg_marquardt_strategy.cc b/internal/ceres/levenberg_marquardt_strategy.cc
index b32e131..76e73b4 100644
--- a/internal/ceres/levenberg_marquardt_strategy.cc
+++ b/internal/ceres/levenberg_marquardt_strategy.cc
@@ -38,6 +38,7 @@
 #include "ceres/internal/eigen.h"
 #include "ceres/linear_least_squares_problems.h"
 #include "ceres/linear_solver.h"
+#include "ceres/parallel_for.h"
 #include "ceres/sparse_matrix.h"
 #include "ceres/trust_region_strategy.h"
 #include "ceres/types.h"
@@ -53,7 +54,9 @@
       min_diagonal_(options.min_lm_diagonal),
       max_diagonal_(options.max_lm_diagonal),
       decrease_factor_(2.0),
-      reuse_diagonal_(false) {
+      reuse_diagonal_(false),
+      context_(options.context),
+      num_threads_(options.num_threads) {
   CHECK(linear_solver_ != nullptr);
   CHECK_GT(min_diagonal_, 0.0);
   CHECK_LE(min_diagonal_, max_diagonal_);
@@ -77,14 +80,18 @@
       diagonal_.resize(num_parameters, 1);
     }
 
-    jacobian->SquaredColumnNorm(diagonal_.data());
-    for (int i = 0; i < num_parameters; ++i) {
-      diagonal_[i] =
-          std::min(std::max(diagonal_[i], min_diagonal_), max_diagonal_);
-    }
+    jacobian->SquaredColumnNorm(diagonal_.data(), context_, num_threads_);
+    ParallelAssign(context_,
+                   num_threads_,
+                   diagonal_,
+                   diagonal_.array().max(min_diagonal_).min(max_diagonal_));
   }
 
-  lm_diagonal_ = (diagonal_ / radius_).array().sqrt();
+  if (lm_diagonal_.size() == 0) {
+    lm_diagonal_.resize(num_parameters);
+  }
+  ParallelAssign(
+      context_, num_threads_, lm_diagonal_, (diagonal_ / radius_).cwiseSqrt());
 
   LinearSolver::PerSolveOptions solve_options;
   solve_options.D = lm_diagonal_.data();
@@ -98,7 +105,7 @@
   // Invalidate the output array lm_step, so that we can detect if
   // the linear solver generated numerical garbage.  This is known
   // to happen for the DENSE_QR and then DENSE_SCHUR solver when
-  // the Jacobin is severely rank deficient and mu is too small.
+  // the Jacobian is severely rank deficient and mu is too small.
   InvalidateArray(num_parameters, step);
 
   // Instead of solving Jx = -r, solve Jy = r.
@@ -120,7 +127,8 @@
     linear_solver_summary.termination_type =
         LinearSolverTerminationType::FAILURE;
   } else {
-    VectorRef(step, num_parameters) *= -1.0;
+    VectorRef step_vec(step, num_parameters);
+    ParallelAssign(context_, num_threads_, step_vec, -step_vec);
   }
   reuse_diagonal_ = true;
 
diff --git a/internal/ceres/levenberg_marquardt_strategy.h b/internal/ceres/levenberg_marquardt_strategy.h
index 2b014a6..bc6aae3 100644
--- a/internal/ceres/levenberg_marquardt_strategy.h
+++ b/internal/ceres/levenberg_marquardt_strategy.h
@@ -38,6 +38,8 @@
 
 namespace ceres::internal {
 
+class ContextImpl;
+
 // Levenberg-Marquardt step computation and trust region sizing
 // strategy based on on "Methods for Nonlinear Least Squares" by
 // K. Madsen, H.B. Nielsen and O. Tingleff. Available to download from
@@ -81,6 +83,8 @@
   // allocations in every iteration and reuse when a step fails and
   // ComputeStep is called again.
   Vector lm_diagonal_;  // lm_diagonal_ = sqrt(diagonal_ / radius_);
+  ContextImpl* context_;
+  int num_threads_;
 };
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/minimizer.h b/internal/ceres/minimizer.h
index d4ba708..b7a4df7 100644
--- a/internal/ceres/minimizer.h
+++ b/internal/ceres/minimizer.h
@@ -47,6 +47,7 @@
 class TrustRegionStrategy;
 class CoordinateDescentMinimizer;
 class LinearSolver;
+class ContextImpl;
 
 // Interface for non-linear least squares solvers.
 class CERES_NO_EXPORT Minimizer {
@@ -113,6 +114,7 @@
     int max_num_iterations;
     double max_solver_time_in_seconds;
     int num_threads;
+    ContextImpl* context = nullptr;
 
     // Number of times the linear solver should be retried in case of
     // numerical failure. The retries are done by exponentially scaling up
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index d91a195..26a4a79 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -38,6 +38,7 @@
 
 #include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
+#include "ceres/internal/eigen.h"
 #include "ceres/internal/export.h"
 #include "glog/logging.h"
 
@@ -61,7 +62,7 @@
 namespace parallel_for_details {
 // Get arguments of callable as a tuple
 template <typename F, typename... Args>
-std::tuple<Args...> args_of(void (F::*)(Args...) const);
+std::tuple<std::decay_t<Args>...> args_of(void (F::*)(Args...) const);
 
 template <typename F>
 using args_of_t = decltype(args_of(&F::operator()));
@@ -75,24 +76,58 @@
 // For parallel for iterations of type [](int i) -> void
 template <typename F>
 struct InvokeImpl<F, std::tuple<int>> {
-  static void Invoke(int thread_id, int i, const F& function) {
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
     (void)thread_id;
-    function(i);
+    for (int i = start; i < end; ++i) {
+      function(i);
+    }
   }
 };
 
 // For parallel for iterations of type [](int thread_id, int i) -> void
 template <typename F>
 struct InvokeImpl<F, std::tuple<int, int>> {
-  static void Invoke(int thread_id, int i, const F& function) {
-    function(thread_id, i);
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
+    for (int i = start; i < end; ++i) {
+      function(thread_id, i);
+    }
+  }
+};
+
+// For parallel for iterations of type [](tuple<int, int> range) -> void
+template <typename F>
+struct InvokeImpl<F, std::tuple<std::tuple<int, int>>> {
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
+    (void)thread_id;
+    function(std::make_tuple(start, end));
+  }
+};
+
+// For parallel for iterations of type [](int thread_id, tuple<int, int> range)
+// -> void
+template <typename F>
+struct InvokeImpl<F, std::tuple<int, std::tuple<int, int>>> {
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
+    function(thread_id, std::make_tuple(start, end));
   }
 };
 
 // Invoke function passing thread_id only if required
 template <typename F>
-void Invoke(int thread_id, int i, const F& function) {
-  InvokeImpl<F, args_of_t<F>>::Invoke(thread_id, i, function);
+void InvokeOnSegment(int thread_id, int start, int end, const F& function) {
+  InvokeImpl<F, args_of_t<F>>::InvokeOnSegment(thread_id, start, end, function);
 }
 
 // Check if it is possible to split range [start; end) into at most
@@ -218,7 +253,8 @@
 // implemented by each threading backend
 template <typename F>
 void ParallelInvoke(ContextImpl* context,
-                    int i,
+                    int start,
+                    int end,
                     int num_threads,
                     const F& function);
 
@@ -241,9 +277,7 @@
   }
 
   if (num_threads == 1 || end - start == 1) {
-    for (int i = start; i < end; ++i) {
-      Invoke<F>(0, i, function);
-    }
+    InvokeOnSegment<F>(0, start, end, function);
     return;
   }
 
@@ -293,9 +327,8 @@
                 const int partition_start = partitions[partition_id];
                 const int partition_end = partitions[partition_id + 1];
 
-                for (int i = partition_start; i < partition_end; ++i) {
-                  Invoke<F>(thread_id, i, function);
-                }
+                InvokeOnSegment<F>(
+                    thread_id, partition_start, partition_end, function);
               });
 }
 
@@ -330,11 +363,50 @@
                 const int partition_start = partitions[partition_id];
                 const int partition_end = partitions[partition_id + 1];
 
-                for (int i = partition_start; i < partition_end; ++i) {
-                  Invoke<F>(thread_id, i, function);
-                }
+                InvokeOnSegment<F>(
+                    thread_id, partition_start, partition_end, function);
               });
 }
+
+// Evaluate vector expression in parallel
+// Assuming LhsExpression and RhsExpression are some sort of
+// column-vector expression, assignment lhs = rhs
+// is eavluated over a set of contiguous blocks in parallel.
+// This is expected to work well in the case of vector-based
+// expressions (since they typically do not result into
+// temporaries).
+// This method expects lhs to be size-compatible with rhs
+template <typename LhsExpression, typename RhsExpression>
+void ParallelAssign(ContextImpl* context,
+                    int num_threads,
+                    LhsExpression& lhs,
+                    const RhsExpression& rhs) {
+  static_assert(LhsExpression::ColsAtCompileTime == 1);
+  static_assert(RhsExpression::ColsAtCompileTime == 1);
+  CHECK_EQ(lhs.rows(), rhs.rows());
+  const int num_rows = lhs.rows();
+  ParallelFor(context,
+              0,
+              num_rows,
+              num_threads,
+              [&lhs, &rhs](const std::tuple<int, int>& range) {
+                auto [start, end] = range;
+                lhs.segment(start, end - start) =
+                    rhs.segment(start, end - start);
+              });
+}
+
+// Set vector to zero using num_threads
+template <typename VectorType>
+void ParallelSetZero(ContextImpl* context,
+                     int num_threads,
+                     VectorType& vector) {
+  ParallelSetZero(context, num_threads, vector.data(), vector.rows());
+}
+void ParallelSetZero(ContextImpl* context,
+                     int num_threads,
+                     double* values,
+                     int num_values);
 }  // namespace ceres::internal
 
 // Backend-specific implementations of ParallelInvoke
diff --git a/internal/ceres/parallel_for_cxx.cc b/internal/ceres/parallel_for_cxx.cc
index cdf9966..3f47436 100644
--- a/internal/ceres/parallel_for_cxx.cc
+++ b/internal/ceres/parallel_for_cxx.cc
@@ -74,4 +74,18 @@
 
 int MaxNumThreadsAvailable() { return ThreadPool::MaxNumThreadsAvailable(); }
 
+void ParallelSetZero(ContextImpl* context,
+                     int num_threads,
+                     double* values,
+                     int num_values) {
+  ParallelFor(context,
+              0,
+              num_values,
+              num_threads,
+              [values](std::tuple<int, int> range) {
+                auto [start, end] = range;
+                std::fill(values + start, values + end, 0.);
+              });
+}
+
 }  // namespace ceres::internal
diff --git a/internal/ceres/parallel_for_cxx.h b/internal/ceres/parallel_for_cxx.h
index 0731609..c33ae08 100644
--- a/internal/ceres/parallel_for_cxx.h
+++ b/internal/ceres/parallel_for_cxx.h
@@ -213,9 +213,7 @@
       const int curr_end = curr_start + base_block_size +
                            (block_id < num_base_p1_sized_blocks ? 1 : 0);
       // Perform each task in current block
-      for (int i = curr_start; i < curr_end; ++i) {
-        Invoke<F>(thread_id, i, function);
-      }
+      InvokeOnSegment<F>(thread_id, curr_start, curr_end, function);
     }
     shared_state->block_until_finished.Finished(num_jobs_finished);
   };
diff --git a/internal/ceres/parallel_for_test.cc b/internal/ceres/parallel_for_test.cc
index 6974ab9..058b0a3 100644
--- a/internal/ceres/parallel_for_test.cc
+++ b/internal/ceres/parallel_for_test.cc
@@ -70,6 +70,28 @@
   }
 }
 
+// Tests parallel for loop with ranges
+TEST(ParallelForWithRange, NumThreads) {
+  ContextImpl context;
+  context.EnsureMinimumThreads(/*num_threads=*/2);
+
+  const int size = 16;
+  std::vector<int> expected_results(size, 0);
+  for (int i = 0; i < size; ++i) {
+    expected_results[i] = std::sqrt(i);
+  }
+
+  for (int num_threads = 1; num_threads <= 8; ++num_threads) {
+    std::vector<int> values(size, 0);
+    ParallelFor(
+        &context, 0, size, num_threads, [&values](std::tuple<int, int> range) {
+          auto [start, end] = range;
+          for (int i = start; i < end; ++i) values[i] = std::sqrt(i);
+        });
+    EXPECT_THAT(values, ElementsAreArray(expected_results));
+  }
+}
+
 // Tests the parallel for loop with the thread ID interface computes the correct
 // result for various number of threads.
 TEST(ParallelForWithThreadId, NumThreads) {
@@ -410,4 +432,40 @@
   }
 }
 
+TEST(ParallelAssign, D2MulX) {
+  const int kVectorSize = 1024 * 1024;
+  const int kMaxNumThreads = 8;
+
+  const Vector D_full = Vector::Random(kVectorSize * 2);
+  const ConstVectorRef D(D_full.data() + kVectorSize, kVectorSize);
+  const Vector x = Vector::Random(kVectorSize);
+  const Vector y_expected = D.array().square() * x.array();
+  ContextImpl context;
+  context.EnsureMinimumThreads(kMaxNumThreads);
+
+  for (int num_threads = 1; num_threads <= kMaxNumThreads; ++num_threads) {
+    Vector y_observed(kVectorSize);
+    ParallelAssign(
+        &context, num_threads, y_observed, D.array().square() * x.array());
+
+    // Bit-exact result is expected
+    CHECK_EQ((y_expected - y_observed).squaredNorm(), 0.);
+  }
+}
+
+TEST(ParallelAssign, SetZero) {
+  const int kVectorSize = 1024 * 1024;
+  const int kMaxNumThreads = 8;
+
+  ContextImpl context;
+  context.EnsureMinimumThreads(kMaxNumThreads);
+
+  for (int num_threads = 1; num_threads <= kMaxNumThreads; ++num_threads) {
+    Vector x = Vector::Random(kVectorSize);
+    ParallelSetZero(&context, num_threads, x);
+
+    CHECK_EQ(x.squaredNorm(), 0.);
+  }
+}
+
 }  // namespace ceres::internal
diff --git a/internal/ceres/parallel_vector_operations_benchmark.cc b/internal/ceres/parallel_vector_operations_benchmark.cc
new file mode 100644
index 0000000..7180b54
--- /dev/null
+++ b/internal/ceres/parallel_vector_operations_benchmark.cc
@@ -0,0 +1,282 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#include "benchmark/benchmark.h"
+#include "ceres/eigen_vector_ops.h"
+#include "ceres/parallel_for.h"
+
+namespace ceres::internal {
+
+const int kVectorSize = 64 * 1024 * 1024 / sizeof(double);
+
+static void SetZero(benchmark::State& state) {
+  Vector x = Vector::Random(kVectorSize);
+  for (auto _ : state) {
+    x.setZero();
+  }
+  CHECK_EQ(x.squaredNorm(), 0.);
+}
+BENCHMARK(SetZero);
+
+static void SetZeroParallel(benchmark::State& state) {
+  const int num_threads = state.range(0);
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  Vector x = Vector::Random(kVectorSize);
+  for (auto _ : state) {
+    ParallelSetZero(&context, num_threads, x);
+  }
+  CHECK_EQ(x.squaredNorm(), 0.);
+}
+BENCHMARK(SetZeroParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Negate(benchmark::State& state) {
+  Vector x = Vector::Random(kVectorSize).normalized();
+  const Vector x_init = x;
+
+  for (auto _ : state) {
+    x = -x;
+  }
+  CHECK((x - x_init).squaredNorm() == 0. || (x + x_init).squaredNorm() == 0);
+}
+BENCHMARK(Negate);
+
+static void NegateParallel(benchmark::State& state) {
+  const int num_threads = state.range(0);
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  Vector x = Vector::Random(kVectorSize).normalized();
+  const Vector x_init = x;
+
+  for (auto _ : state) {
+    ParallelAssign(&context, num_threads, x, -x);
+  }
+  CHECK((x - x_init).squaredNorm() == 0. || (x + x_init).squaredNorm() == 0);
+}
+BENCHMARK(NegateParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Assign(benchmark::State& state) {
+  Vector x = Vector::Random(kVectorSize);
+  Vector y = Vector(kVectorSize);
+  for (auto _ : state) {
+    y.block(0, 0, kVectorSize, 1) = x.block(0, 0, kVectorSize, 1);
+  }
+  CHECK_EQ((y - x).squaredNorm(), 0.);
+}
+BENCHMARK(Assign);
+
+static void AssignParallel(benchmark::State& state) {
+  const int num_threads = state.range(0);
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  Vector x = Vector::Random(kVectorSize);
+  Vector y = Vector(kVectorSize);
+
+  for (auto _ : state) {
+    ParallelAssign(&context, num_threads, y, x);
+  }
+  CHECK_EQ((y - x).squaredNorm(), 0.);
+}
+BENCHMARK(AssignParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void D2X(benchmark::State& state) {
+  const Vector x = Vector::Random(kVectorSize);
+  const Vector D = Vector::Random(kVectorSize);
+  Vector y = Vector::Zero(kVectorSize);
+  for (auto _ : state) {
+    y = D.array().square() * x.array();
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+BENCHMARK(D2X);
+
+static void D2XParallel(benchmark::State& state) {
+  const int num_threads = state.range(0);
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  const Vector x = Vector::Random(kVectorSize);
+  const Vector D = Vector::Random(kVectorSize);
+  Vector y = Vector(kVectorSize);
+
+  for (auto _ : state) {
+    ParallelAssign(&context, num_threads, y, D.array().square() * x.array());
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+BENCHMARK(D2XParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void DivideSqrt(benchmark::State& state) {
+  Vector diagonal = Vector::Random(kVectorSize).array().abs();
+  const double radius = 0.5;
+  for (auto _ : state) {
+    diagonal = (diagonal / radius).array().sqrt();
+  }
+  CHECK_GT(diagonal.squaredNorm(), 0.);
+}
+BENCHMARK(DivideSqrt);
+
+static void DivideSqrtParallel(benchmark::State& state) {
+  const int num_threads = state.range(0);
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  Vector diagonal = Vector::Random(kVectorSize).array().abs();
+  const double radius = 0.5;
+  for (auto _ : state) {
+    ParallelAssign(
+        &context, num_threads, diagonal, (diagonal / radius).cwiseSqrt());
+  }
+  CHECK_GT(diagonal.squaredNorm(), 0.);
+}
+BENCHMARK(DivideSqrtParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Clamp(benchmark::State& state) {
+  Vector diagonal = Vector::Random(kVectorSize);
+  const double min = -0.5;
+  const double max = 0.5;
+  for (auto _ : state) {
+    for (int i = 0; i < kVectorSize; ++i) {
+      diagonal[i] = std::min(std::max(diagonal[i], min), max);
+    }
+  }
+  CHECK_LE(diagonal.maxCoeff(), 0.5);
+  CHECK_GE(diagonal.minCoeff(), -0.5);
+}
+BENCHMARK(Clamp);
+
+static void ClampParallel(benchmark::State& state) {
+  const int num_threads = state.range(0);
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  Vector diagonal = Vector::Random(kVectorSize);
+  const double min = -0.5;
+  const double max = 0.5;
+  for (auto _ : state) {
+    ParallelAssign(
+        &context, num_threads, diagonal, diagonal.array().max(min).min(max));
+  }
+  CHECK_LE(diagonal.maxCoeff(), 0.5);
+  CHECK_GE(diagonal.minCoeff(), -0.5);
+}
+BENCHMARK(ClampParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Norm(benchmark::State& state) {
+  const Vector x = Vector::Random(kVectorSize);
+
+  double total = 0.;
+  for (auto _ : state) {
+    total += x.norm();
+  }
+  CHECK_GT(total, 0.);
+}
+BENCHMARK(Norm);
+
+static void NormParallel(benchmark::State& state) {
+  const int num_threads = state.range(0);
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  const Vector x = Vector::Random(kVectorSize);
+
+  double total = 0.;
+  for (auto _ : state) {
+    total += Norm(x, &context, num_threads);
+  }
+  CHECK_GT(total, 0.);
+}
+BENCHMARK(NormParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Dot(benchmark::State& state) {
+  const Vector x = Vector::Random(kVectorSize);
+  const Vector y = Vector::Random(kVectorSize);
+
+  double total = 0.;
+  for (auto _ : state) {
+    total += x.dot(y);
+  }
+  CHECK_NE(total, 0.);
+}
+BENCHMARK(Dot);
+
+static void DotParallel(benchmark::State& state) {
+  const int num_threads = state.range(0);
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  const Vector x = Vector::Random(kVectorSize);
+  const Vector y = Vector::Random(kVectorSize);
+
+  double total = 0.;
+  for (auto _ : state) {
+    total += Dot(x, y, &context, num_threads);
+  }
+  CHECK_NE(total, 0.);
+}
+BENCHMARK(DotParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Axpby(benchmark::State& state) {
+  const Vector x = Vector::Random(kVectorSize);
+  const Vector y = Vector::Random(kVectorSize);
+  Vector z = Vector::Zero(kVectorSize);
+  const double a = 3.1415;
+  const double b = 1.2345;
+
+  double total = 0.;
+  for (auto _ : state) {
+    z = a * x + b * y;
+  }
+  CHECK_GT(z.squaredNorm(), 0.);
+}
+BENCHMARK(Axpby);
+
+static void AxpbyParallel(benchmark::State& state) {
+  const int num_threads = state.range(0);
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  const Vector x = Vector::Random(kVectorSize);
+  const Vector y = Vector::Random(kVectorSize);
+  Vector z = Vector::Zero(kVectorSize);
+  const double a = 3.1415;
+  const double b = 1.2345;
+
+  double total = 0.;
+  for (auto _ : state) {
+    Axpby(a, x, b, y, z, &context, num_threads);
+  }
+  CHECK_GT(z.squaredNorm(), 0.);
+}
+BENCHMARK(AxpbyParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+}  // namespace ceres::internal
+
+BENCHMARK_MAIN();
diff --git a/internal/ceres/partitioned_matrix_view_test.cc b/internal/ceres/partitioned_matrix_view_test.cc
index 2ff99bf..f9fee84 100644
--- a/internal/ceres/partitioned_matrix_view_test.cc
+++ b/internal/ceres/partitioned_matrix_view_test.cc
@@ -68,7 +68,6 @@
     CHECK(problem != nullptr);
     A_ = std::move(problem->A);
     auto block_sparse = down_cast<BlockSparseMatrix*>(A_.get());
-    block_sparse->AddTransposeBlockStructure();
 
     options_.num_threads = num_threads;
     options_.context = &context_;
diff --git a/internal/ceres/preconditioner.cc b/internal/ceres/preconditioner.cc
index 33174de..3e2bf41 100644
--- a/internal/ceres/preconditioner.cc
+++ b/internal/ceres/preconditioner.cc
@@ -47,8 +47,8 @@
 }
 
 SparseMatrixPreconditionerWrapper::SparseMatrixPreconditionerWrapper(
-    const SparseMatrix* matrix)
-    : matrix_(matrix) {
+    const SparseMatrix* matrix, const Preconditioner::Options& options)
+    : matrix_(matrix), options_(options) {
   CHECK(matrix != nullptr);
 }
 
@@ -62,7 +62,8 @@
 
 void SparseMatrixPreconditionerWrapper::RightMultiplyAndAccumulate(
     const double* x, double* y) const {
-  matrix_->RightMultiplyAndAccumulate(x, y);
+  matrix_->RightMultiplyAndAccumulate(
+      x, y, options_.context, options_.num_threads);
 }
 
 int SparseMatrixPreconditionerWrapper::num_rows() const {
diff --git a/internal/ceres/preconditioner.h b/internal/ceres/preconditioner.h
index 2d343bd..654ef60 100644
--- a/internal/ceres/preconditioner.h
+++ b/internal/ceres/preconditioner.h
@@ -186,7 +186,8 @@
     : public SparseMatrixPreconditioner {
  public:
   // Wrapper does NOT take ownership of the matrix pointer.
-  explicit SparseMatrixPreconditionerWrapper(const SparseMatrix* matrix);
+  explicit SparseMatrixPreconditionerWrapper(
+      const SparseMatrix* matrix, const Preconditioner::Options& options);
   ~SparseMatrixPreconditionerWrapper() override;
 
   // Preconditioner interface
@@ -196,6 +197,7 @@
  private:
   bool UpdateImpl(const SparseMatrix& A, const double* D) override;
   const SparseMatrix* matrix_;
+  const Preconditioner::Options options_;
 };
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/preprocessor.cc b/internal/ceres/preprocessor.cc
index 9d60ef9..286dfda 100644
--- a/internal/ceres/preprocessor.cc
+++ b/internal/ceres/preprocessor.cc
@@ -82,9 +82,11 @@
   double* reduced_parameters = pp->reduced_parameters.data();
   program->ParameterBlocksToStateVector(reduced_parameters);
 
+  auto context = pp->problem->context();
   Minimizer::Options& minimizer_options = pp->minimizer_options;
   minimizer_options = Minimizer::Options(options);
   minimizer_options.evaluator = pp->evaluator;
+  minimizer_options.context = context;
 
   if (options.logging_type != SILENT) {
     pp->logging_callback = std::make_unique<LoggingCallback>(
diff --git a/internal/ceres/program.cc b/internal/ceres/program.cc
index 9a4ecf3..7549744 100644
--- a/internal/ceres/program.cc
+++ b/internal/ceres/program.cc
@@ -45,6 +45,7 @@
 #include "ceres/loss_function.h"
 #include "ceres/manifold.h"
 #include "ceres/map_util.h"
+#include "ceres/parallel_for.h"
 #include "ceres/parameter_block.h"
 #include "ceres/problem.h"
 #include "ceres/residual_block.h"
@@ -108,16 +109,32 @@
 
 bool Program::Plus(const double* state,
                    const double* delta,
-                   double* state_plus_delta) const {
-  for (auto* parameter_block : parameter_blocks_) {
-    if (!parameter_block->Plus(state, delta, state_plus_delta)) {
-      return false;
-    }
-    state += parameter_block->Size();
-    delta += parameter_block->TangentSize();
-    state_plus_delta += parameter_block->Size();
-  }
-  return true;
+                   double* state_plus_delta,
+                   ContextImpl* context,
+                   int num_threads) const {
+  std::atomic<bool> abort(false);
+  auto* parameter_blocks = parameter_blocks_.data();
+  ParallelFor(
+      context,
+      0,
+      parameter_blocks_.size(),
+      num_threads,
+      [&abort, state, delta, state_plus_delta, parameter_blocks](int block_id) {
+        if (abort) {
+          return;
+        }
+        auto parameter_block = parameter_blocks[block_id];
+
+        auto block_state = state + parameter_block->state_offset();
+        auto block_delta = delta + parameter_block->delta_offset();
+        auto block_state_plus_delta =
+            state_plus_delta + parameter_block->state_offset();
+        if (!parameter_block->Plus(
+                block_state, block_delta, block_state_plus_delta)) {
+          abort = true;
+        }
+      });
+  return abort == false;
 }
 
 void Program::SetParameterOffsetsAndIndex() {
diff --git a/internal/ceres/program.h b/internal/ceres/program.h
index cb923ae..7c0ceee 100644
--- a/internal/ceres/program.h
+++ b/internal/ceres/program.h
@@ -46,6 +46,7 @@
 class ProblemImpl;
 class ResidualBlock;
 class TripletSparseMatrix;
+class ContextImpl;
 
 // A nonlinear least squares optimization problem. This is different from the
 // similarly-named "Problem" object, which offers a mutation interface for
@@ -86,7 +87,9 @@
   // Update a state vector for the program given a delta.
   bool Plus(const double* state,
             const double* delta,
-            double* state_plus_delta) const;
+            double* state_plus_delta,
+            ContextImpl* context,
+            int num_threads) const;
 
   // Set the parameter indices and offsets. This permits mapping backward
   // from a ParameterBlock* to an index in the parameter_blocks() vector. For
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h
index 251a177..cc4da9f 100644
--- a/internal/ceres/program_evaluator.h
+++ b/internal/ceres/program_evaluator.h
@@ -118,7 +118,8 @@
         program_(program),
         jacobian_writer_(options, program),
         evaluate_preparers_(std::move(
-            jacobian_writer_.CreateEvaluatePreparers(options.num_threads))) {
+            jacobian_writer_.CreateEvaluatePreparers(options.num_threads))),
+        num_parameters_(program->NumEffectiveParameters()) {
     BuildResidualLayout(*program, &residual_layout_);
     evaluate_scratch_ = std::move(CreateEvaluatorScratch(
         *program, static_cast<unsigned>(options.num_threads)));
@@ -155,20 +156,24 @@
     }
 
     if (residuals != nullptr) {
-      VectorRef(residuals, program_->NumResiduals()).setZero();
+      ParallelSetZero(options_.context,
+                      options_.num_threads,
+                      residuals,
+                      program_->NumResiduals());
     }
 
     if (jacobian != nullptr) {
-      jacobian->SetZero();
+      jacobian->SetZero(options_.context, options_.num_threads);
     }
 
     // Each thread gets it's own cost and evaluate scratch space.
     for (int i = 0; i < options_.num_threads; ++i) {
       evaluate_scratch_[i].cost = 0.0;
       if (gradient != nullptr) {
-        VectorRef(evaluate_scratch_[i].gradient.get(),
-                  program_->NumEffectiveParameters())
-            .setZero();
+        ParallelSetZero(options_.context,
+                        options_.num_threads,
+                        evaluate_scratch_[i].gradient.get(),
+                        num_parameters_);
       }
     }
 
@@ -251,18 +256,23 @@
         });
 
     if (!abort) {
-      const int num_parameters = program_->NumEffectiveParameters();
-
       // Sum the cost and gradient (if requested) from each thread.
       (*cost) = 0.0;
       if (gradient != nullptr) {
-        VectorRef(gradient, num_parameters).setZero();
+        auto gradient_vector = VectorRef(gradient, num_parameters_);
+        ParallelSetZero(
+            options_.context, options_.num_threads, gradient_vector);
       }
       for (int i = 0; i < options_.num_threads; ++i) {
         (*cost) += evaluate_scratch_[i].cost;
         if (gradient != nullptr) {
-          VectorRef(gradient, num_parameters) +=
-              VectorRef(evaluate_scratch_[i].gradient.get(), num_parameters);
+          auto gradient_vector = VectorRef(gradient, num_parameters_);
+          ParallelAssign(
+              options_.context,
+              options_.num_threads,
+              gradient_vector,
+              gradient_vector + VectorRef(evaluate_scratch_[i].gradient.get(),
+                                          num_parameters_));
         }
       }
 
@@ -272,7 +282,7 @@
       // necessary.
       if (jacobian != nullptr) {
         JacobianFinalizer f;
-        f(jacobian, num_parameters);
+        f(jacobian, num_parameters_);
       }
     }
     return !abort;
@@ -281,7 +291,8 @@
   bool Plus(const double* state,
             const double* delta,
             double* state_plus_delta) const final {
-    return program_->Plus(state, delta, state_plus_delta);
+    return program_->Plus(
+        state, delta, state_plus_delta, options_.context, options_.num_threads);
   }
 
   int NumParameters() const final { return program_->NumParameters(); }
@@ -361,6 +372,7 @@
   std::unique_ptr<EvaluatePreparer[]> evaluate_preparers_;
   std::unique_ptr<EvaluateScratch[]> evaluate_scratch_;
   std::vector<int> residual_layout_;
+  int num_parameters_;
   ::ceres::internal::ExecutionSummary execution_summary_;
 };
 
diff --git a/internal/ceres/sparse_matrix.cc b/internal/ceres/sparse_matrix.cc
index 40899bb..584ac1d 100644
--- a/internal/ceres/sparse_matrix.cc
+++ b/internal/ceres/sparse_matrix.cc
@@ -34,4 +34,20 @@
 
 SparseMatrix::~SparseMatrix() = default;
 
+void SparseMatrix::SquaredColumnNorm(double* x,
+                                     ContextImpl* context,
+                                     int num_threads) const {
+  (void)context;
+  (void)num_threads;
+  SquaredColumnNorm(x);
+}
+
+void SparseMatrix::ScaleColumns(const double* x,
+                                ContextImpl* context,
+                                int num_threads) {
+  (void)context;
+  (void)num_threads;
+  ScaleColumns(x);
+}
+
 }  // namespace ceres::internal
diff --git a/internal/ceres/sparse_matrix.h b/internal/ceres/sparse_matrix.h
index deab7b1..6cc421b 100644
--- a/internal/ceres/sparse_matrix.h
+++ b/internal/ceres/sparse_matrix.h
@@ -69,19 +69,28 @@
   ~SparseMatrix() override;
 
   // y += Ax;
+  using LinearOperator::RightMultiplyAndAccumulate;
   void RightMultiplyAndAccumulate(const double* x,
                                   double* y) const override = 0;
+
   // y += A'x;
   void LeftMultiplyAndAccumulate(const double* x, double* y) const override = 0;
 
   // In MATLAB notation sum(A.*A, 1)
   virtual void SquaredColumnNorm(double* x) const = 0;
+  virtual void SquaredColumnNorm(double* x,
+                                 ContextImpl* context,
+                                 int num_threads) const;
   // A = A * diag(scale)
   virtual void ScaleColumns(const double* scale) = 0;
+  virtual void ScaleColumns(const double* scale,
+                            ContextImpl* context,
+                            int num_threads);
 
   // A = 0. A->num_nonzeros() == 0 is true after this call. The
   // sparsity pattern is preserved.
   virtual void SetZero() = 0;
+  virtual void SetZero(ContextImpl* contex, int num_threads) { SetZero(); }
 
   // Resize and populate dense_matrix with a dense version of the
   // sparse matrix.
diff --git a/internal/ceres/trust_region_minimizer.cc b/internal/ceres/trust_region_minimizer.cc
index 800b676..47f7fd5 100644
--- a/internal/ceres/trust_region_minimizer.cc
+++ b/internal/ceres/trust_region_minimizer.cc
@@ -42,9 +42,11 @@
 #include "Eigen/Core"
 #include "ceres/array_utils.h"
 #include "ceres/coordinate_descent_minimizer.h"
+#include "ceres/eigen_vector_ops.h"
 #include "ceres/evaluator.h"
 #include "ceres/file.h"
 #include "ceres/line_search.h"
+#include "ceres/parallel_for.h"
 #include "ceres/stringprintf.h"
 #include "ceres/types.h"
 #include "ceres/wall_time.h"
@@ -268,7 +270,8 @@
     }
 
     // jacobian = jacobian * diag(J'J) ^{-1}
-    jacobian_->ScaleColumns(jacobian_scaling_.data());
+    jacobian_->ScaleColumns(
+        jacobian_scaling_.data(), options_.context, options_.num_threads);
   }
 
   // The gradient exists in the local tangent space. To account for
@@ -419,11 +422,15 @@
   //  = f'f/2  - 1/2 [ f'f + 2f'J * step + step' * J' * J * step]
   //  = -f'J * step - step' * J' * J * step / 2
   //  = -(J * step)'(f + J * step / 2)
-  model_residuals_.setZero();
+  ParallelSetZero(options_.context, options_.num_threads, model_residuals_);
   jacobian_->RightMultiplyAndAccumulate(trust_region_step_.data(),
-                                        model_residuals_.data());
-  model_cost_change_ =
-      -model_residuals_.dot(residuals_ + model_residuals_ / 2.0);
+                                        model_residuals_.data(),
+                                        options_.context,
+                                        options_.num_threads);
+  model_cost_change_ = -Dot(model_residuals_,
+                            residuals_ + model_residuals_ / 2.0,
+                            options_.context,
+                            options_.num_threads);
 
   // TODO(sameeragarwal)
   //
@@ -433,7 +440,10 @@
   iteration_summary_.step_is_valid = (model_cost_change_ > 0.0);
   if (iteration_summary_.step_is_valid) {
     // Undo the Jacobian column scaling.
-    delta_ = (trust_region_step_.array() * jacobian_scaling_.array()).matrix();
+    ParallelAssign(options_.context,
+                   options_.num_threads,
+                   delta_,
+                   (trust_region_step_.array() * jacobian_scaling_.array()));
     num_consecutive_invalid_steps_ = 0;
   }
 
diff --git a/internal/ceres/trust_region_preprocessor.cc b/internal/ceres/trust_region_preprocessor.cc
index 84e783f..67f928e 100644
--- a/internal/ceres/trust_region_preprocessor.cc
+++ b/internal/ceres/trust_region_preprocessor.cc
@@ -355,6 +355,8 @@
   strategy_options.trust_region_strategy_type =
       options.trust_region_strategy_type;
   strategy_options.dogleg_type = options.dogleg_type;
+  strategy_options.context = pp->problem->context();
+  strategy_options.num_threads = options.num_threads;
   pp->minimizer_options.trust_region_strategy =
       TrustRegionStrategy::Create(strategy_options);
   CHECK(pp->minimizer_options.trust_region_strategy != nullptr);
diff --git a/internal/ceres/trust_region_strategy.h b/internal/ceres/trust_region_strategy.h
index 334f06f..2be10f3 100644
--- a/internal/ceres/trust_region_strategy.h
+++ b/internal/ceres/trust_region_strategy.h
@@ -73,6 +73,9 @@
 
     // Further specify which dogleg method to use
     DoglegType dogleg_type = TRADITIONAL_DOGLEG;
+
+    ContextImpl* context = nullptr;
+    int num_threads = 1;
   };
 
   // Factory.