Speed up locking when num_threads = 1. This is done by skipping the mutex entirely when num_threads = 1: MakeConditionalLock returns an empty std::unique_lock that is not associated with any mutex, so nothing is locked or unlocked. The lock around the diagonal update in schur_eliminator_impl.h is removed outright. Before on Mac M1 Pro BM_BlockSparseJacobiPreconditionerBA/1 55724955 ns 55150500 ns 12 BM_BlockSparseJacobiPreconditionerBA/2 32243968 ns 32119545 ns 22 BM_BlockSparseJacobiPreconditionerBA/4 21749220 ns 21448485 ns 33 BM_BlockSparseJacobiPreconditionerBA/8 31190360 ns 27924520 ns 25 BM_BlockSparseJacobiPreconditionerBA/16 31130365 ns 26186656 ns 32 BM_BlockCRSJacobiPreconditionerBA/1 60739399 ns 60737750 ns 12 BM_BlockCRSJacobiPreconditionerBA/2 35197331 ns 34524650 ns 20 BM_BlockCRSJacobiPreconditionerBA/4 21977577 ns 21241606 ns 33 BM_BlockCRSJacobiPreconditionerBA/8 31597485 ns 27892000 ns 25 BM_BlockCRSJacobiPreconditionerBA/16 31097307 ns 21841367 ns 30 BM_BlockSparseJacobiPreconditionerUnstructured/1 63510295 ns 63488833 ns 12 BM_BlockSparseJacobiPreconditionerUnstructured/2 34208964 ns 34063333 ns 21 BM_BlockSparseJacobiPreconditionerUnstructured/4 22443432 ns 22145455 ns 33 BM_BlockSparseJacobiPreconditionerUnstructured/8 24571793 ns 22801323 ns 31 BM_BlockSparseJacobiPreconditionerUnstructured/16 23507892 ns 20859250 ns 36 BM_BlockCRSJacobiPreconditionerUnstructured/1 63282292 ns 63280273 ns 11 BM_BlockCRSJacobiPreconditionerUnstructured/2 32994633 ns 32845810 ns 21 BM_BlockCRSJacobiPreconditionerUnstructured/4 18249372 ns 17526200 ns 40 BM_BlockCRSJacobiPreconditionerUnstructured/8 16539623 ns 15937341 ns 44 BM_BlockCRSJacobiPreconditionerUnstructured/16 16549527 ns 12850294 ns 51 After -------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations -------------------------------------------------------------------------------------------- BM_BlockSparseJacobiPreconditionerBA/1 44348891 ns 44348875 ns 16 BM_BlockSparseJacobiPreconditionerBA/2 32840149 ns 32706476 ns 21 BM_BlockSparseJacobiPreconditionerBA/4 22318142 ns 21904419 ns 31 BM_BlockSparseJacobiPreconditionerBA/8 31322712 ns 27964120 ns 25 
BM_BlockSparseJacobiPreconditionerBA/16 31742625 ns 26624577 ns 26 BM_BlockCRSJacobiPreconditionerBA/1 49870369 ns 49869714 ns 14 BM_BlockCRSJacobiPreconditionerBA/2 34901023 ns 34234900 ns 20 BM_BlockCRSJacobiPreconditionerBA/4 21946689 ns 21215394 ns 33 BM_BlockCRSJacobiPreconditionerBA/8 31461558 ns 27728360 ns 25 BM_BlockCRSJacobiPreconditionerBA/16 30792414 ns 23063968 ns 31 BM_BlockSparseJacobiPreconditionerUnstructured/1 62120649 ns 61979750 ns 12 BM_BlockSparseJacobiPreconditionerUnstructured/2 33806314 ns 33729526 ns 19 BM_BlockSparseJacobiPreconditionerUnstructured/4 22195685 ns 21831500 ns 32 BM_BlockSparseJacobiPreconditionerUnstructured/8 25003440 ns 22765452 ns 31 BM_BlockSparseJacobiPreconditionerUnstructured/16 24746505 ns 19425364 ns 33 BM_BlockCRSJacobiPreconditionerUnstructured/1 57506343 ns 57502077 ns 13 BM_BlockCRSJacobiPreconditionerUnstructured/2 33691442 ns 33584810 ns 21 BM_BlockCRSJacobiPreconditionerUnstructured/4 18121943 ns 17579050 ns 40 BM_BlockCRSJacobiPreconditionerUnstructured/8 17624991 ns 16086568 ns 44 BM_BlockCRSJacobiPreconditionerUnstructured/16 16493819 ns 13160882 ns 51 Change-Id: Ieac097f5e06a08b48170dcfb06b5145f1ee512e6
diff --git a/internal/ceres/block_jacobi_preconditioner.cc b/internal/ceres/block_jacobi_preconditioner.cc index b7ee002..81eb419 100644 --- a/internal/ceres/block_jacobi_preconditioner.cc +++ b/internal/ceres/block_jacobi_preconditioner.cc
@@ -75,13 +75,14 @@ MatrixRef m(cell_info->values, row_stride, col_stride); ConstMatrixRef b( values + cell.position, row_block_size, col_block_size); - std::lock_guard<std::mutex> l(cell_info->m); + auto lock = + MakeConditionalLock(options_.num_threads, cell_info->m); // clang-format off MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic, - Eigen::Dynamic,Eigen::Dynamic, 1>( - values + cell.position, row_block_size,col_block_size, - values + cell.position, row_block_size,col_block_size, - cell_info->values,r, c,row_stride,col_stride); + Eigen::Dynamic,Eigen::Dynamic, 1>( + values + cell.position, row_block_size,col_block_size, + values + cell.position, row_block_size,col_block_size, + cell_info->values,r, c,row_stride,col_stride); // clang-format on } }); @@ -193,7 +194,7 @@ // MatrixTransposeMatrixMultiply, otherwise we could use it // here to further speed up the following expression. auto b = row_block.middleCols(c, col_block_size); - std::lock_guard<std::mutex> l(locks_[col]); + auto lock = MakeConditionalLock(options_.num_threads, locks_[col]); m.noalias() += b.transpose() * b; c += col_block_size; }
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h index 3c3d887..234c7db 100644 --- a/internal/ceres/parallel_for.h +++ b/internal/ceres/parallel_for.h
@@ -33,6 +33,7 @@ #define CERES_INTERNAL_PARALLEL_FOR_H_ #include <functional> +#include <mutex> #include "ceres/context_impl.h" #include "ceres/internal/disable_warnings.h" @@ -41,6 +42,13 @@ namespace ceres::internal { +// Returns a lock that owns m, or an empty no-op lock when num_threads == 1. +inline decltype(auto) MakeConditionalLock(const int num_threads, + std::mutex& m) { + return (num_threads == 1) ? std::unique_lock<std::mutex>{} + : std::unique_lock<std::mutex>{m}; +} + // Returns the maximum number of threads supported by the threading backend // Ceres was compiled with. CERES_NO_EXPORT
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h index 62b7487..884c0cf 100644 --- a/internal/ceres/schur_eliminator_impl.h +++ b/internal/ceres/schur_eliminator_impl.h
@@ -1,5 +1,5 @@ // Ceres Solver - A fast non-linear least squares minimizer -// Copyright 2015 Google Inc. All rights reserved. +// Copyright 2022 Google Inc. All rights reserved. // http://ceres-solver.org/ // // Redistribution and use in source and binary forms, with or without @@ -205,8 +205,6 @@ const int block_size = bs->cols[i].size; typename EigenTypes<Eigen::Dynamic>::ConstVectorRef diag( D + bs->cols[i].position, block_size); - - std::lock_guard<std::mutex> l(cell_info->m); MatrixRef m(cell_info->values, row_stride, col_stride); m.block(r, c, block_size, block_size).diagonal() += diag.array().square().matrix(); @@ -409,7 +407,7 @@ const int block_id = row.cells[c].block_id; const int block_size = bs->cols[block_id].size; const int block = block_id - num_eliminate_blocks_; - std::lock_guard<std::mutex> l(*rhs_locks_[block]); + auto lock = MakeConditionalLock(num_threads_, *rhs_locks_[block]); // clang-format off MatrixTransposeVectorMultiply<kRowBlockSize, kFBlockSize, 1>( values + row.cells[c].position, @@ -549,7 +547,7 @@ lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride); if (cell_info != nullptr) { const int block2_size = bs->cols[it2->first].size; - std::lock_guard<std::mutex> l(cell_info->m); + auto lock = MakeConditionalLock(num_threads_, cell_info->m); // clang-format off MatrixMatrixMultiply <kFBlockSize, kEBlockSize, kEBlockSize, kFBlockSize, -1>( @@ -626,7 +624,7 @@ CellInfo* cell_info = lhs->GetCell(block1, block1, &r, &c, &row_stride, &col_stride); if (cell_info != nullptr) { - std::lock_guard<std::mutex> l(cell_info->m); + auto lock = MakeConditionalLock(num_threads_, cell_info->m); // This multiply currently ignores the fact that this is a // symmetric outer product. 
// clang-format off @@ -647,7 +645,7 @@ lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride); if (cell_info != nullptr) { const int block2_size = bs->cols[row.cells[j].block_id].size; - std::lock_guard<std::mutex> l(cell_info->m); + auto lock = MakeConditionalLock(num_threads_, cell_info->m); // clang-format off MatrixTransposeMatrixMultiply <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>( @@ -681,7 +679,7 @@ CellInfo* cell_info = lhs->GetCell(block1, block1, &r, &c, &row_stride, &col_stride); if (cell_info != nullptr) { - std::lock_guard<std::mutex> l(cell_info->m); + auto lock = MakeConditionalLock(num_threads_, cell_info->m); // block += b1.transpose() * b1; // clang-format off MatrixTransposeMatrixMultiply @@ -702,7 +700,7 @@ lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride); if (cell_info != nullptr) { // block += b1.transpose() * b2; - std::lock_guard<std::mutex> l(cell_info->m); + auto lock = MakeConditionalLock(num_threads_, cell_info->m); // clang-format off MatrixTransposeMatrixMultiply <kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>(