Changes TBB to use tbb::task_arena instead of tbb::task_scheduler_init.
Fixes the current implementation, where the desired number of threads
may not be honored if another tbb::task_scheduler_init is instantiated.
We use tbb::task_arena to solve this; note that tbb::task_arena is only
available in newer versions of TBB.
Also improves performance by not creating/destroying the TBB setup via
tbb::task_scheduler_init on every iteration evaluation. This improves
performance in single-threaded mode using TBB by 10x.
By not explicitly calling tbb::task_scheduler_init, this change will
either respect any active tbb::task_scheduler_init instantiations or use
the default TBB settings, which are hardware dependent. Ceres will honor
the user's requested number of threads through the task_arenas.
Tested by compiling with TBB enabled and running the unit tests.
Change-Id: I5538407563449cdb5a0eaf8b8ccab62263912110
diff --git a/internal/ceres/coordinate_descent_minimizer.cc b/internal/ceres/coordinate_descent_minimizer.cc
index 77e3bbf..df9c41f 100644
--- a/internal/ceres/coordinate_descent_minimizer.cc
+++ b/internal/ceres/coordinate_descent_minimizer.cc
@@ -32,7 +32,7 @@
#ifdef CERES_USE_TBB
#include <tbb/parallel_for.h>
-#include <tbb/task_scheduler_init.h>
+#include <tbb/task_arena.h>
#endif
#include <iterator>
@@ -156,7 +156,6 @@
continue;
}
-
const int num_inner_iteration_threads =
min(options.num_threads, num_problems);
evaluator_options_.num_threads =
@@ -175,11 +174,12 @@
j < independent_set_offsets_[i + 1];
++j) {
#else
- tbb::task_scheduler_init tbb_task_scheduler_init(
- num_inner_iteration_threads);
- tbb::parallel_for(independent_set_offsets_[i],
- independent_set_offsets_[i + 1],
- [&](int j) {
+ tbb::task_arena task_arena(num_inner_iteration_threads);
+
+ task_arena.execute([&]{
+ tbb::parallel_for(independent_set_offsets_[i],
+ independent_set_offsets_[i + 1],
+ [&](int j) {
#endif // !CERES_USE_TBB
const ScopedThreadToken scoped_thread_token(&thread_token_provider);
@@ -217,6 +217,7 @@
}
#ifdef CERES_USE_TBB
);
+ });
#endif
}
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index 81b3ba1..f2a345d 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -32,7 +32,7 @@
#ifdef CERES_USE_TBB
#include <tbb/parallel_for.h>
-#include <tbb/task_scheduler_init.h>
+#include <tbb/task_arena.h>
#endif
#include <algorithm>
@@ -367,9 +367,11 @@
#endif // CERES_NO_THREADS
#ifdef CERES_USE_TBB
- tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
- tbb::parallel_for(0, num_parameters, [&](int i) {
- tbb::parallel_for(i, num_parameters, [&](int j) {
+ tbb::task_arena task_arena(num_threads);
+
+ task_arena.execute([&]{
+ tbb::parallel_for(0, num_parameters, [&](int i) {
+ tbb::parallel_for(i, num_parameters, [&](int j) {
#endif // CERES_USE_TBB
int covariance_row_idx = cum_parameter_size[i];
@@ -401,6 +403,7 @@
#ifdef CERES_USE_TBB
);
});
+ });
#else
}
#endif // CERES_USE_TBB
@@ -727,8 +730,10 @@
#ifndef CERES_USE_TBB
for (int r = 0; r < num_cols; ++r) {
#else
- tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
- tbb::parallel_for(0, num_cols, [&](int r) {
+ tbb::task_arena task_arena(num_threads);
+
+ task_arena.execute([&]{
+ tbb::parallel_for(0, num_cols, [&](int r) {
#endif // !CERES_USE_TBB
const int row_begin = rows[r];
@@ -753,6 +758,7 @@
}
#ifdef CERES_USE_TBB
);
+ });
#endif // CERES_USE_TBB
free(permutation);
@@ -928,8 +934,10 @@
#ifndef CERES_USE_TBB
for (int r = 0; r < num_cols; ++r) {
#else
- tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
- tbb::parallel_for(0, num_cols, [&](int r) {
+ tbb::task_arena task_arena(num_threads);
+
+ task_arena.execute([&]{
+ tbb::parallel_for(0, num_cols, [&](int r) {
#endif // !CERES_USE_TBB
const int row_begin = rows[r];
@@ -958,6 +966,7 @@
#ifdef CERES_USE_TBB
);
+ });
#endif // CERES_USE_TBB
event_logger.AddEvent("Inverse");
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h
index 6049d94..a625b23 100644
--- a/internal/ceres/program_evaluator.h
+++ b/internal/ceres/program_evaluator.h
@@ -99,7 +99,7 @@
#include <atomic>
#include <tbb/parallel_for.h>
-#include <tbb/task_scheduler_init.h>
+#include <tbb/task_arena.h>
#endif
namespace ceres {
@@ -196,8 +196,10 @@
#ifdef CERES_USE_TBB
std::atomic_bool abort(false);
- tbb::task_scheduler_init tbb_task_scheduler_init(options_.num_threads);
- tbb::parallel_for(0, num_residual_blocks, [&](int i) {
+ tbb::task_arena task_arena(options_.num_threads);
+
+ task_arena.execute([&]{
+ tbb::parallel_for(0, num_residual_blocks, [&](int i) {
#endif // CERES_USE_TBB
if (abort) {
@@ -288,6 +290,7 @@
}
#ifdef CERES_USE_TBB
);
+ });
#endif // CERES_USE_TBB
if (!abort) {
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index 01409fd..1bc4d8e 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -68,7 +68,7 @@
#ifdef CERES_USE_TBB
#include <tbb/parallel_for.h>
-#include <tbb/task_scheduler_init.h>
+#include <tbb/task_arena.h>
#endif
namespace ceres {
@@ -198,8 +198,10 @@
#ifndef CERES_USE_TBB
for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
#else
- tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
- tbb::parallel_for(num_eliminate_blocks_, num_col_blocks, [&](int i) {
+ tbb::task_arena task_arena(num_threads_);
+
+ task_arena.execute([&]{
+ tbb::parallel_for(num_eliminate_blocks_, num_col_blocks, [&](int i) {
#endif // !CERES_USE_TBB
const int block_id = i - num_eliminate_blocks_;
@@ -220,6 +222,7 @@
}
#ifdef CERES_USE_TBB
);
+ });
#endif // CERES_USE_TBB
}
@@ -245,8 +248,10 @@
#ifndef CERES_USE_TBB
for (int i = 0; i < chunks_.size(); ++i) {
#else
- tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
- tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
+ tbb::task_arena task_arena(num_threads_);
+
+ task_arena.execute([&]{
+ tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
#endif // !CERES_USE_TBB
const ScopedThreadToken scoped_thread_token(&thread_token_provider);
@@ -317,7 +322,8 @@
thread_id, bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
}
#ifdef CERES_USE_TBB
- );
+ );
+ });
#endif // CERES_USE_TBB
// For rows with no e_blocks, the schur complement update reduces to
@@ -342,8 +348,10 @@
#ifndef CERES_USE_TBB
for (int i = 0; i < chunks_.size(); ++i) {
#else
- tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
- tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
+ tbb::task_arena task_arena(num_threads_);
+
+ task_arena.execute([&]{
+ tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
#endif // !CERES_USE_TBB
const Chunk& chunk = chunks_[i];
@@ -403,6 +411,7 @@
}
#ifdef CERES_USE_TBB
);
+ });
#endif // CERES_USE_TBB
}