Speed up locking when num_threads = 1.

This is done by returning an empty std::unique_lock (one with no associated
mutex, so nothing is locked) when num_threads = 1.

Before on Mac M1 Pro
BM_BlockSparseJacobiPreconditionerBA/1              55724955 ns     55150500 ns           12
BM_BlockSparseJacobiPreconditionerBA/2              32243968 ns     32119545 ns           22
BM_BlockSparseJacobiPreconditionerBA/4              21749220 ns     21448485 ns           33
BM_BlockSparseJacobiPreconditionerBA/8              31190360 ns     27924520 ns           25
BM_BlockSparseJacobiPreconditionerBA/16             31130365 ns     26186656 ns           32
BM_BlockCRSJacobiPreconditionerBA/1                 60739399 ns     60737750 ns           12
BM_BlockCRSJacobiPreconditionerBA/2                 35197331 ns     34524650 ns           20
BM_BlockCRSJacobiPreconditionerBA/4                 21977577 ns     21241606 ns           33
BM_BlockCRSJacobiPreconditionerBA/8                 31597485 ns     27892000 ns           25
BM_BlockCRSJacobiPreconditionerBA/16                31097307 ns     21841367 ns           30
BM_BlockSparseJacobiPreconditionerUnstructured/1    63510295 ns     63488833 ns           12
BM_BlockSparseJacobiPreconditionerUnstructured/2    34208964 ns     34063333 ns           21
BM_BlockSparseJacobiPreconditionerUnstructured/4    22443432 ns     22145455 ns           33
BM_BlockSparseJacobiPreconditionerUnstructured/8    24571793 ns     22801323 ns           31
BM_BlockSparseJacobiPreconditionerUnstructured/16   23507892 ns     20859250 ns           36
BM_BlockCRSJacobiPreconditionerUnstructured/1       63282292 ns     63280273 ns           11
BM_BlockCRSJacobiPreconditionerUnstructured/2       32994633 ns     32845810 ns           21
BM_BlockCRSJacobiPreconditionerUnstructured/4       18249372 ns     17526200 ns           40
BM_BlockCRSJacobiPreconditionerUnstructured/8       16539623 ns     15937341 ns           44
BM_BlockCRSJacobiPreconditionerUnstructured/16      16549527 ns     12850294 ns           51

After

--------------------------------------------------------------------------------------------
Benchmark                                                  Time             CPU   Iterations
--------------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditionerBA/1              44348891 ns     44348875 ns           16
BM_BlockSparseJacobiPreconditionerBA/2              32840149 ns     32706476 ns           21
BM_BlockSparseJacobiPreconditionerBA/4              22318142 ns     21904419 ns           31
BM_BlockSparseJacobiPreconditionerBA/8              31322712 ns     27964120 ns           25
BM_BlockSparseJacobiPreconditionerBA/16             31742625 ns     26624577 ns           26
BM_BlockCRSJacobiPreconditionerBA/1                 49870369 ns     49869714 ns           14
BM_BlockCRSJacobiPreconditionerBA/2                 34901023 ns     34234900 ns           20
BM_BlockCRSJacobiPreconditionerBA/4                 21946689 ns     21215394 ns           33
BM_BlockCRSJacobiPreconditionerBA/8                 31461558 ns     27728360 ns           25
BM_BlockCRSJacobiPreconditionerBA/16                30792414 ns     23063968 ns           31
BM_BlockSparseJacobiPreconditionerUnstructured/1    62120649 ns     61979750 ns           12
BM_BlockSparseJacobiPreconditionerUnstructured/2    33806314 ns     33729526 ns           19
BM_BlockSparseJacobiPreconditionerUnstructured/4    22195685 ns     21831500 ns           32
BM_BlockSparseJacobiPreconditionerUnstructured/8    25003440 ns     22765452 ns           31
BM_BlockSparseJacobiPreconditionerUnstructured/16   24746505 ns     19425364 ns           33
BM_BlockCRSJacobiPreconditionerUnstructured/1       57506343 ns     57502077 ns           13
BM_BlockCRSJacobiPreconditionerUnstructured/2       33691442 ns     33584810 ns           21
BM_BlockCRSJacobiPreconditionerUnstructured/4       18121943 ns     17579050 ns           40
BM_BlockCRSJacobiPreconditionerUnstructured/8       17624991 ns     16086568 ns           44
BM_BlockCRSJacobiPreconditionerUnstructured/16      16493819 ns     13160882 ns           51

Change-Id: Ieac097f5e06a08b48170dcfb06b5145f1ee512e6
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index 3c3d887..234c7db 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -33,6 +33,7 @@
 #define CERES_INTERNAL_PARALLEL_FOR_H_
 
 #include <functional>
+#include <mutex>
 
 #include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
@@ -41,6 +42,13 @@
 
 namespace ceres::internal {
 
+// Returns a lock holding m, or an empty lock (no mutex) if num_threads == 1.
+inline decltype(auto) MakeConditionalLock(const int num_threads,
+                                          std::mutex& m) {
+  return (num_threads == 1) ? std::unique_lock<std::mutex>{}
+                            : std::unique_lock<std::mutex>{m};
+}
+
 // Returns the maximum number of threads supported by the threading backend
 // Ceres was compiled with.
 CERES_NO_EXPORT