Speed up locking when num_threads = 1.

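This is done by not locking at all when num_threads = 1:
MakeConditionalLock returns an empty std::unique_lock (one that owns no
mutex) in that case, and locks the given mutex otherwise. The lock_guard
around the diagonal update in schur_eliminator_impl.h is removed outright.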

Before (on a Mac M1 Pro)
BM_BlockSparseJacobiPreconditionerBA/1              55724955 ns     55150500 ns           12
BM_BlockSparseJacobiPreconditionerBA/2              32243968 ns     32119545 ns           22
BM_BlockSparseJacobiPreconditionerBA/4              21749220 ns     21448485 ns           33
BM_BlockSparseJacobiPreconditionerBA/8              31190360 ns     27924520 ns           25
BM_BlockSparseJacobiPreconditionerBA/16             31130365 ns     26186656 ns           32
BM_BlockCRSJacobiPreconditionerBA/1                 60739399 ns     60737750 ns           12
BM_BlockCRSJacobiPreconditionerBA/2                 35197331 ns     34524650 ns           20
BM_BlockCRSJacobiPreconditionerBA/4                 21977577 ns     21241606 ns           33
BM_BlockCRSJacobiPreconditionerBA/8                 31597485 ns     27892000 ns           25
BM_BlockCRSJacobiPreconditionerBA/16                31097307 ns     21841367 ns           30
BM_BlockSparseJacobiPreconditionerUnstructured/1    63510295 ns     63488833 ns           12
BM_BlockSparseJacobiPreconditionerUnstructured/2    34208964 ns     34063333 ns           21
BM_BlockSparseJacobiPreconditionerUnstructured/4    22443432 ns     22145455 ns           33
BM_BlockSparseJacobiPreconditionerUnstructured/8    24571793 ns     22801323 ns           31
BM_BlockSparseJacobiPreconditionerUnstructured/16   23507892 ns     20859250 ns           36
BM_BlockCRSJacobiPreconditionerUnstructured/1       63282292 ns     63280273 ns           11
BM_BlockCRSJacobiPreconditionerUnstructured/2       32994633 ns     32845810 ns           21
BM_BlockCRSJacobiPreconditionerUnstructured/4       18249372 ns     17526200 ns           40
BM_BlockCRSJacobiPreconditionerUnstructured/8       16539623 ns     15937341 ns           44
BM_BlockCRSJacobiPreconditionerUnstructured/16      16549527 ns     12850294 ns           51

After

--------------------------------------------------------------------------------------------
Benchmark                                                  Time             CPU   Iterations
--------------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditionerBA/1              44348891 ns     44348875 ns           16
BM_BlockSparseJacobiPreconditionerBA/2              32840149 ns     32706476 ns           21
BM_BlockSparseJacobiPreconditionerBA/4              22318142 ns     21904419 ns           31
BM_BlockSparseJacobiPreconditionerBA/8              31322712 ns     27964120 ns           25
BM_BlockSparseJacobiPreconditionerBA/16             31742625 ns     26624577 ns           26
BM_BlockCRSJacobiPreconditionerBA/1                 49870369 ns     49869714 ns           14
BM_BlockCRSJacobiPreconditionerBA/2                 34901023 ns     34234900 ns           20
BM_BlockCRSJacobiPreconditionerBA/4                 21946689 ns     21215394 ns           33
BM_BlockCRSJacobiPreconditionerBA/8                 31461558 ns     27728360 ns           25
BM_BlockCRSJacobiPreconditionerBA/16                30792414 ns     23063968 ns           31
BM_BlockSparseJacobiPreconditionerUnstructured/1    62120649 ns     61979750 ns           12
BM_BlockSparseJacobiPreconditionerUnstructured/2    33806314 ns     33729526 ns           19
BM_BlockSparseJacobiPreconditionerUnstructured/4    22195685 ns     21831500 ns           32
BM_BlockSparseJacobiPreconditionerUnstructured/8    25003440 ns     22765452 ns           31
BM_BlockSparseJacobiPreconditionerUnstructured/16   24746505 ns     19425364 ns           33
BM_BlockCRSJacobiPreconditionerUnstructured/1       57506343 ns     57502077 ns           13
BM_BlockCRSJacobiPreconditionerUnstructured/2       33691442 ns     33584810 ns           21
BM_BlockCRSJacobiPreconditionerUnstructured/4       18121943 ns     17579050 ns           40
BM_BlockCRSJacobiPreconditionerUnstructured/8       17624991 ns     16086568 ns           44
BM_BlockCRSJacobiPreconditionerUnstructured/16      16493819 ns     13160882 ns           51

Change-Id: Ieac097f5e06a08b48170dcfb06b5145f1ee512e6
diff --git a/internal/ceres/block_jacobi_preconditioner.cc b/internal/ceres/block_jacobi_preconditioner.cc
index b7ee002..81eb419 100644
--- a/internal/ceres/block_jacobi_preconditioner.cc
+++ b/internal/ceres/block_jacobi_preconditioner.cc
@@ -75,13 +75,14 @@
                   MatrixRef m(cell_info->values, row_stride, col_stride);
                   ConstMatrixRef b(
                       values + cell.position, row_block_size, col_block_size);
-                  std::lock_guard<std::mutex> l(cell_info->m);
+                  auto lock =
+                      MakeConditionalLock(options_.num_threads, cell_info->m);
                   // clang-format off
                   MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic,
-                   Eigen::Dynamic,Eigen::Dynamic, 1>(
-                   values + cell.position, row_block_size,col_block_size,
-                   values + cell.position, row_block_size,col_block_size,
-                   cell_info->values,r, c,row_stride,col_stride);
+                      Eigen::Dynamic,Eigen::Dynamic, 1>(
+                          values + cell.position, row_block_size,col_block_size,
+                          values + cell.position, row_block_size,col_block_size,
+                          cell_info->values,r, c,row_stride,col_stride);
                   // clang-format on
                 }
               });
@@ -193,7 +194,7 @@
           // MatrixTransposeMatrixMultiply, otherwise we could use it
           // here to further speed up the following expression.
           auto b = row_block.middleCols(c, col_block_size);
-          std::lock_guard<std::mutex> l(locks_[col]);
+          auto lock = MakeConditionalLock(options_.num_threads, locks_[col]);
           m.noalias() += b.transpose() * b;
           c += col_block_size;
         }
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index 3c3d887..234c7db 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -33,6 +33,7 @@
 #define CERES_INTERNAL_PARALLEL_FOR_H_
 
 #include <functional>
+#include <mutex>
 
 #include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
@@ -41,6 +42,13 @@
 
 namespace ceres::internal {
 
+// Returns an empty lock that owns no mutex if num_threads = 1, else locks m.
+inline decltype(auto) MakeConditionalLock(const int num_threads,
+                                          std::mutex& m) {
+  return (num_threads == 1) ? std::unique_lock<std::mutex>{}
+                            : std::unique_lock<std::mutex>{m};
+}
+
 // Returns the maximum number of threads supported by the threading backend
 // Ceres was compiled with.
 CERES_NO_EXPORT
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index 62b7487..884c0cf 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -1,5 +1,5 @@
 // Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2015 Google Inc. All rights reserved.
+// Copyright 2022 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
@@ -205,8 +205,6 @@
                     const int block_size = bs->cols[i].size;
                     typename EigenTypes<Eigen::Dynamic>::ConstVectorRef diag(
                         D + bs->cols[i].position, block_size);
-
-                    std::lock_guard<std::mutex> l(cell_info->m);
                     MatrixRef m(cell_info->values, row_stride, col_stride);
                     m.block(r, c, block_size, block_size).diagonal() +=
                         diag.array().square().matrix();
@@ -409,7 +407,7 @@
       const int block_id = row.cells[c].block_id;
       const int block_size = bs->cols[block_id].size;
       const int block = block_id - num_eliminate_blocks_;
-      std::lock_guard<std::mutex> l(*rhs_locks_[block]);
+      auto lock = MakeConditionalLock(num_threads_, *rhs_locks_[block]);
       // clang-format off
       MatrixTransposeVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
           values + row.cells[c].position,
@@ -549,7 +547,7 @@
           lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride);
       if (cell_info != nullptr) {
         const int block2_size = bs->cols[it2->first].size;
-        std::lock_guard<std::mutex> l(cell_info->m);
+        auto lock = MakeConditionalLock(num_threads_, cell_info->m);
         // clang-format off
         MatrixMatrixMultiply
             <kFBlockSize, kEBlockSize, kEBlockSize, kFBlockSize, -1>(
@@ -626,7 +624,7 @@
     CellInfo* cell_info =
         lhs->GetCell(block1, block1, &r, &c, &row_stride, &col_stride);
     if (cell_info != nullptr) {
-      std::lock_guard<std::mutex> l(cell_info->m);
+      auto lock = MakeConditionalLock(num_threads_, cell_info->m);
       // This multiply currently ignores the fact that this is a
       // symmetric outer product.
       // clang-format off
@@ -647,7 +645,7 @@
           lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride);
       if (cell_info != nullptr) {
         const int block2_size = bs->cols[row.cells[j].block_id].size;
-        std::lock_guard<std::mutex> l(cell_info->m);
+        auto lock = MakeConditionalLock(num_threads_, cell_info->m);
         // clang-format off
         MatrixTransposeMatrixMultiply
             <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>(
@@ -681,7 +679,7 @@
     CellInfo* cell_info =
         lhs->GetCell(block1, block1, &r, &c, &row_stride, &col_stride);
     if (cell_info != nullptr) {
-      std::lock_guard<std::mutex> l(cell_info->m);
+      auto lock = MakeConditionalLock(num_threads_, cell_info->m);
       // block += b1.transpose() * b1;
       // clang-format off
       MatrixTransposeMatrixMultiply
@@ -702,7 +700,7 @@
           lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride);
       if (cell_info != nullptr) {
         // block += b1.transpose() * b2;
-        std::lock_guard<std::mutex> l(cell_info->m);
+        auto lock = MakeConditionalLock(num_threads_, cell_info->m);
         // clang-format off
         MatrixTransposeMatrixMultiply
             <kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>(