Speed up locking when num_threads = 1.
This is done by not locking at all when num_threads = 1: MakeConditionalLock
returns a std::unique_lock with no associated mutex in that case.
Before on Mac M1 Pro
BM_BlockSparseJacobiPreconditionerBA/1 55724955 ns 55150500 ns 12
BM_BlockSparseJacobiPreconditionerBA/2 32243968 ns 32119545 ns 22
BM_BlockSparseJacobiPreconditionerBA/4 21749220 ns 21448485 ns 33
BM_BlockSparseJacobiPreconditionerBA/8 31190360 ns 27924520 ns 25
BM_BlockSparseJacobiPreconditionerBA/16 31130365 ns 26186656 ns 32
BM_BlockCRSJacobiPreconditionerBA/1 60739399 ns 60737750 ns 12
BM_BlockCRSJacobiPreconditionerBA/2 35197331 ns 34524650 ns 20
BM_BlockCRSJacobiPreconditionerBA/4 21977577 ns 21241606 ns 33
BM_BlockCRSJacobiPreconditionerBA/8 31597485 ns 27892000 ns 25
BM_BlockCRSJacobiPreconditionerBA/16 31097307 ns 21841367 ns 30
BM_BlockSparseJacobiPreconditionerUnstructured/1 63510295 ns 63488833 ns 12
BM_BlockSparseJacobiPreconditionerUnstructured/2 34208964 ns 34063333 ns 21
BM_BlockSparseJacobiPreconditionerUnstructured/4 22443432 ns 22145455 ns 33
BM_BlockSparseJacobiPreconditionerUnstructured/8 24571793 ns 22801323 ns 31
BM_BlockSparseJacobiPreconditionerUnstructured/16 23507892 ns 20859250 ns 36
BM_BlockCRSJacobiPreconditionerUnstructured/1 63282292 ns 63280273 ns 11
BM_BlockCRSJacobiPreconditionerUnstructured/2 32994633 ns 32845810 ns 21
BM_BlockCRSJacobiPreconditionerUnstructured/4 18249372 ns 17526200 ns 40
BM_BlockCRSJacobiPreconditionerUnstructured/8 16539623 ns 15937341 ns 44
BM_BlockCRSJacobiPreconditionerUnstructured/16 16549527 ns 12850294 ns 51
After
--------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditionerBA/1 44348891 ns 44348875 ns 16
BM_BlockSparseJacobiPreconditionerBA/2 32840149 ns 32706476 ns 21
BM_BlockSparseJacobiPreconditionerBA/4 22318142 ns 21904419 ns 31
BM_BlockSparseJacobiPreconditionerBA/8 31322712 ns 27964120 ns 25
BM_BlockSparseJacobiPreconditionerBA/16 31742625 ns 26624577 ns 26
BM_BlockCRSJacobiPreconditionerBA/1 49870369 ns 49869714 ns 14
BM_BlockCRSJacobiPreconditionerBA/2 34901023 ns 34234900 ns 20
BM_BlockCRSJacobiPreconditionerBA/4 21946689 ns 21215394 ns 33
BM_BlockCRSJacobiPreconditionerBA/8 31461558 ns 27728360 ns 25
BM_BlockCRSJacobiPreconditionerBA/16 30792414 ns 23063968 ns 31
BM_BlockSparseJacobiPreconditionerUnstructured/1 62120649 ns 61979750 ns 12
BM_BlockSparseJacobiPreconditionerUnstructured/2 33806314 ns 33729526 ns 19
BM_BlockSparseJacobiPreconditionerUnstructured/4 22195685 ns 21831500 ns 32
BM_BlockSparseJacobiPreconditionerUnstructured/8 25003440 ns 22765452 ns 31
BM_BlockSparseJacobiPreconditionerUnstructured/16 24746505 ns 19425364 ns 33
BM_BlockCRSJacobiPreconditionerUnstructured/1 57506343 ns 57502077 ns 13
BM_BlockCRSJacobiPreconditionerUnstructured/2 33691442 ns 33584810 ns 21
BM_BlockCRSJacobiPreconditionerUnstructured/4 18121943 ns 17579050 ns 40
BM_BlockCRSJacobiPreconditionerUnstructured/8 17624991 ns 16086568 ns 44
BM_BlockCRSJacobiPreconditionerUnstructured/16 16493819 ns 13160882 ns 51
Change-Id: Ieac097f5e06a08b48170dcfb06b5145f1ee512e6
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index 3c3d887..234c7db 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -33,6 +33,7 @@
#define CERES_INTERNAL_PARALLEL_FOR_H_
#include <functional>
+#include <mutex>
#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
@@ -41,6 +42,13 @@
namespace ceres::internal {
+// Returns a lock on m, or a lock that owns no mutex when num_threads == 1.
+inline decltype(auto) MakeConditionalLock(const int num_threads,
+ std::mutex& m) {
+ return (num_threads == 1) ? std::unique_lock<std::mutex>{}  // nothing locked
+ : std::unique_lock<std::mutex>{m};  // locks m until destroyed
+}
+
// Returns the maximum number of threads supported by the threading backend
// Ceres was compiled with.
CERES_NO_EXPORT