Changes the TBB threading code to use tbb::task_arena instead of tbb::task_scheduler_init. This fixes the current implementation, where the desired number of threads may not be honored if another tbb::task_scheduler_init is instantiated. We solve this with tbb::task_arena, which is only available in newer versions of TBB. This also improves performance by not creating and destroying the TBB setup via tbb::task_scheduler_init on every iteration evaluation; in single-threaded mode using TBB, performance increases by 10x. Because Ceres no longer calls tbb::task_scheduler_init itself, TBB will either respect any active tbb::task_scheduler_init instantiations or use its default settings, which are hardware dependent. Ceres will honor the user's requested number of threads through the task_arenas. Tested by compiling with TBB enabled and running the unit tests. Change-Id: I5538407563449cdb5a0eaf8b8ccab62263912110
diff --git a/internal/ceres/coordinate_descent_minimizer.cc b/internal/ceres/coordinate_descent_minimizer.cc index 77e3bbf..df9c41f 100644 --- a/internal/ceres/coordinate_descent_minimizer.cc +++ b/internal/ceres/coordinate_descent_minimizer.cc
@@ -32,7 +32,7 @@ #ifdef CERES_USE_TBB #include <tbb/parallel_for.h> -#include <tbb/task_scheduler_init.h> +#include <tbb/task_arena.h> #endif #include <iterator> @@ -156,7 +156,6 @@ continue; } - const int num_inner_iteration_threads = min(options.num_threads, num_problems); evaluator_options_.num_threads = @@ -175,11 +174,12 @@ j < independent_set_offsets_[i + 1]; ++j) { #else - tbb::task_scheduler_init tbb_task_scheduler_init( - num_inner_iteration_threads); - tbb::parallel_for(independent_set_offsets_[i], - independent_set_offsets_[i + 1], - [&](int j) { + tbb::task_arena task_arena(num_inner_iteration_threads); + + task_arena.execute([&]{ + tbb::parallel_for(independent_set_offsets_[i], + independent_set_offsets_[i + 1], + [&](int j) { #endif // !CERES_USE_TBB const ScopedThreadToken scoped_thread_token(&thread_token_provider); @@ -217,6 +217,7 @@ } #ifdef CERES_USE_TBB ); + }); #endif }
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc index 81b3ba1..f2a345d 100644 --- a/internal/ceres/covariance_impl.cc +++ b/internal/ceres/covariance_impl.cc
@@ -32,7 +32,7 @@ #ifdef CERES_USE_TBB #include <tbb/parallel_for.h> -#include <tbb/task_scheduler_init.h> +#include <tbb/task_arena.h> #endif #include <algorithm> @@ -367,9 +367,11 @@ #endif // CERES_NO_THREADS #ifdef CERES_USE_TBB - tbb::task_scheduler_init tbb_task_scheduler_init(num_threads); - tbb::parallel_for(0, num_parameters, [&](int i) { - tbb::parallel_for(i, num_parameters, [&](int j) { + tbb::task_arena task_arena(num_threads); + + task_arena.execute([&]{ + tbb::parallel_for(0, num_parameters, [&](int i) { + tbb::parallel_for(i, num_parameters, [&](int j) { #endif // CERES_USE_TBB int covariance_row_idx = cum_parameter_size[i]; @@ -401,6 +403,7 @@ #ifdef CERES_USE_TBB ); }); + }); #else } #endif // CERES_USE_TBB @@ -727,8 +730,10 @@ #ifndef CERES_USE_TBB for (int r = 0; r < num_cols; ++r) { #else - tbb::task_scheduler_init tbb_task_scheduler_init(num_threads); - tbb::parallel_for(0, num_cols, [&](int r) { + tbb::task_arena task_arena(num_threads); + + task_arena.execute([&]{ + tbb::parallel_for(0, num_cols, [&](int r) { #endif // !CERES_USE_TBB const int row_begin = rows[r]; @@ -753,6 +758,7 @@ } #ifdef CERES_USE_TBB ); + }); #endif // CERES_USE_TBB free(permutation); @@ -928,8 +934,10 @@ #ifndef CERES_USE_TBB for (int r = 0; r < num_cols; ++r) { #else - tbb::task_scheduler_init tbb_task_scheduler_init(num_threads); - tbb::parallel_for(0, num_cols, [&](int r) { + tbb::task_arena task_arena(num_threads); + + task_arena.execute([&]{ + tbb::parallel_for(0, num_cols, [&](int r) { #endif // !CERES_USE_TBB const int row_begin = rows[r]; @@ -958,6 +966,7 @@ #ifdef CERES_USE_TBB ); + }); #endif // CERES_USE_TBB event_logger.AddEvent("Inverse");
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h index 6049d94..a625b23 100644 --- a/internal/ceres/program_evaluator.h +++ b/internal/ceres/program_evaluator.h
@@ -99,7 +99,7 @@ #include <atomic> #include <tbb/parallel_for.h> -#include <tbb/task_scheduler_init.h> +#include <tbb/task_arena.h> #endif namespace ceres { @@ -196,8 +196,10 @@ #ifdef CERES_USE_TBB std::atomic_bool abort(false); - tbb::task_scheduler_init tbb_task_scheduler_init(options_.num_threads); - tbb::parallel_for(0, num_residual_blocks, [&](int i) { + tbb::task_arena task_arena(options_.num_threads); + + task_arena.execute([&]{ + tbb::parallel_for(0, num_residual_blocks, [&](int i) { #endif // CERES_USE_TBB if (abort) { @@ -288,6 +290,7 @@ } #ifdef CERES_USE_TBB ); + }); #endif // CERES_USE_TBB if (!abort) {
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h index 01409fd..1bc4d8e 100644 --- a/internal/ceres/schur_eliminator_impl.h +++ b/internal/ceres/schur_eliminator_impl.h
@@ -68,7 +68,7 @@ #ifdef CERES_USE_TBB #include <tbb/parallel_for.h> -#include <tbb/task_scheduler_init.h> +#include <tbb/task_arena.h> #endif namespace ceres { @@ -198,8 +198,10 @@ #ifndef CERES_USE_TBB for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) { #else - tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_); - tbb::parallel_for(num_eliminate_blocks_, num_col_blocks, [&](int i) { + tbb::task_arena task_arena(num_threads_); + + task_arena.execute([&]{ + tbb::parallel_for(num_eliminate_blocks_, num_col_blocks, [&](int i) { #endif // !CERES_USE_TBB const int block_id = i - num_eliminate_blocks_; @@ -220,6 +222,7 @@ } #ifdef CERES_USE_TBB ); + }); #endif // CERES_USE_TBB } @@ -245,8 +248,10 @@ #ifndef CERES_USE_TBB for (int i = 0; i < chunks_.size(); ++i) { #else - tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_); - tbb::parallel_for(0, int(chunks_.size()), [&](int i) { + tbb::task_arena task_arena(num_threads_); + + task_arena.execute([&]{ + tbb::parallel_for(0, int(chunks_.size()), [&](int i) { #endif // !CERES_USE_TBB const ScopedThreadToken scoped_thread_token(&thread_token_provider); @@ -317,7 +322,8 @@ thread_id, bs, inverse_ete, buffer, chunk.buffer_layout, lhs); } #ifdef CERES_USE_TBB - ); + ); + }); #endif // CERES_USE_TBB // For rows with no e_blocks, the schur complement update reduces to @@ -342,8 +348,10 @@ #ifndef CERES_USE_TBB for (int i = 0; i < chunks_.size(); ++i) { #else - tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_); - tbb::parallel_for(0, int(chunks_.size()), [&](int i) { + tbb::task_arena task_arena(num_threads_); + + task_arena.execute([&]{ + tbb::parallel_for(0, int(chunks_.size()), [&](int i) { #endif // !CERES_USE_TBB const Chunk& chunk = chunks_[i]; @@ -403,6 +411,7 @@ } #ifdef CERES_USE_TBB ); + }); #endif // CERES_USE_TBB }