Use tbb::task_arena instead of tbb::task_scheduler_init in the TBB backend.

Fixes a problem with the current implementation, where the desired number of
threads may not be honored if another tbb::task_scheduler_init has already
been instantiated elsewhere. tbb::task_arena, which is only available in
newer versions of TBB, is used to solve this.

Also improves performance by no longer creating and destroying the TBB
scheduler via tbb::task_scheduler_init on every iteration evaluation. This
speeds up single-threaded mode with TBB by 10x.

Since tbb::task_scheduler_init is no longer called explicitly, Ceres will
either respect any active tbb::task_scheduler_init instantiation or fall back
to the default TBB settings, which are hardware dependent. The user's
requested number of threads is still honored through the task arenas.
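
For reference, a minimal sketch of the pattern now used in each of the touched
files (num_threads, num_items and DoWork are placeholders, not names from this
patch):

    #include <tbb/parallel_for.h>
    #include <tbb/task_arena.h>

    // Arena limited to the requested number of threads; the global TBB
    // scheduler is initialized lazily with its default settings.
    tbb::task_arena task_arena(num_threads);

    // Work submitted via execute() runs with at most num_threads threads,
    // independent of any tbb::task_scheduler_init created elsewhere.
    task_arena.execute([&] {
      tbb::parallel_for(0, num_items, [&](int i) {
        DoWork(i);
      });
    });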

Tested by compiling with TBB enabled and running the unit tests.

Change-Id: I5538407563449cdb5a0eaf8b8ccab62263912110
diff --git a/internal/ceres/coordinate_descent_minimizer.cc b/internal/ceres/coordinate_descent_minimizer.cc
index 77e3bbf..df9c41f 100644
--- a/internal/ceres/coordinate_descent_minimizer.cc
+++ b/internal/ceres/coordinate_descent_minimizer.cc
@@ -32,7 +32,7 @@
 
 #ifdef CERES_USE_TBB
 #include <tbb/parallel_for.h>
-#include <tbb/task_scheduler_init.h>
+#include <tbb/task_arena.h>
 #endif
 
 #include <iterator>
@@ -156,7 +156,6 @@
       continue;
     }
 
-
     const int num_inner_iteration_threads =
         min(options.num_threads, num_problems);
     evaluator_options_.num_threads =
@@ -175,11 +174,12 @@
          j < independent_set_offsets_[i + 1];
          ++j) {
 #else
-    tbb::task_scheduler_init tbb_task_scheduler_init(
-        num_inner_iteration_threads);
-    tbb::parallel_for(independent_set_offsets_[i],
-                      independent_set_offsets_[i + 1],
-                      [&](int j) {
+    tbb::task_arena task_arena(num_inner_iteration_threads);
+
+    task_arena.execute([&]{
+      tbb::parallel_for(independent_set_offsets_[i],
+                        independent_set_offsets_[i + 1],
+                        [&](int j) {
 #endif // !CERES_USE_TBB
 
       const ScopedThreadToken scoped_thread_token(&thread_token_provider);
@@ -217,6 +217,7 @@
     }
 #ifdef CERES_USE_TBB
   );
+  });
 #endif
   }
 
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index 81b3ba1..f2a345d 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -32,7 +32,7 @@
 
 #ifdef CERES_USE_TBB
 #include <tbb/parallel_for.h>
-#include <tbb/task_scheduler_init.h>
+#include <tbb/task_arena.h>
 #endif
 
 #include <algorithm>
@@ -367,9 +367,11 @@
 #endif // CERES_NO_THREADS
 
 #ifdef CERES_USE_TBB
-  tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
-  tbb::parallel_for(0, num_parameters, [&](int i) {
-    tbb::parallel_for(i, num_parameters, [&](int j) {
+  tbb::task_arena task_arena(num_threads);
+
+  task_arena.execute([&]{
+    tbb::parallel_for(0, num_parameters, [&](int i) {
+      tbb::parallel_for(i, num_parameters, [&](int j) {
 #endif // CERES_USE_TBB
 
       int covariance_row_idx = cum_parameter_size[i];
@@ -401,6 +403,7 @@
 #ifdef CERES_USE_TBB
     );
   });
+  });
 #else
   }
 #endif // CERES_USE_TBB
@@ -727,8 +730,10 @@
 #ifndef CERES_USE_TBB
   for (int r = 0; r < num_cols; ++r) {
 #else
-  tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
-  tbb::parallel_for(0, num_cols, [&](int r) {
+  tbb::task_arena task_arena(num_threads);
+
+  task_arena.execute([&]{
+    tbb::parallel_for(0, num_cols, [&](int r) {
 #endif // !CERES_USE_TBB
 
     const int row_begin = rows[r];
@@ -753,6 +758,7 @@
   }
 #ifdef CERES_USE_TBB
   );
+  });
 #endif // CERES_USE_TBB
 
   free(permutation);
@@ -928,8 +934,10 @@
 #ifndef CERES_USE_TBB
   for (int r = 0; r < num_cols; ++r) {
 #else
-  tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
-  tbb::parallel_for(0, num_cols, [&](int r) {
+  tbb::task_arena task_arena(num_threads);
+
+  task_arena.execute([&]{
+    tbb::parallel_for(0, num_cols, [&](int r) {
 #endif // !CERES_USE_TBB
 
     const int row_begin = rows[r];
@@ -958,6 +966,7 @@
 
 #ifdef CERES_USE_TBB
   );
+  });
 #endif // CERES_USE_TBB
 
   event_logger.AddEvent("Inverse");
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h
index 6049d94..a625b23 100644
--- a/internal/ceres/program_evaluator.h
+++ b/internal/ceres/program_evaluator.h
@@ -99,7 +99,7 @@
 #include <atomic>
 
 #include <tbb/parallel_for.h>
-#include <tbb/task_scheduler_init.h>
+#include <tbb/task_arena.h>
 #endif
 
 namespace ceres {
@@ -196,8 +196,10 @@
 
 #ifdef CERES_USE_TBB
     std::atomic_bool abort(false);
-    tbb::task_scheduler_init tbb_task_scheduler_init(options_.num_threads);
-    tbb::parallel_for(0, num_residual_blocks, [&](int i) {
+    tbb::task_arena task_arena(options_.num_threads);
+
+    task_arena.execute([&]{
+      tbb::parallel_for(0, num_residual_blocks, [&](int i) {
 #endif // CERES_USE_TBB
 
       if (abort) {
@@ -288,6 +290,7 @@
     }
 #ifdef CERES_USE_TBB
     );
+    });
 #endif // CERES_USE_TBB
 
     if (!abort) {
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index 01409fd..1bc4d8e 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -68,7 +68,7 @@
 
 #ifdef CERES_USE_TBB
 #include <tbb/parallel_for.h>
-#include <tbb/task_scheduler_init.h>
+#include <tbb/task_arena.h>
 #endif
 
 namespace ceres {
@@ -198,8 +198,10 @@
 #ifndef CERES_USE_TBB
     for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
 #else
-    tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
-    tbb::parallel_for(num_eliminate_blocks_, num_col_blocks, [&](int i) {
+    tbb::task_arena task_arena(num_threads_);
+
+    task_arena.execute([&]{
+      tbb::parallel_for(num_eliminate_blocks_, num_col_blocks, [&](int i) {
 #endif // !CERES_USE_TBB
 
       const int block_id = i - num_eliminate_blocks_;
@@ -220,6 +222,7 @@
     }
 #ifdef CERES_USE_TBB
     );
+    });
 #endif // CERES_USE_TBB
   }
 
@@ -245,8 +248,10 @@
 #ifndef CERES_USE_TBB
   for (int i = 0; i < chunks_.size(); ++i) {
 #else
-  tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
-  tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
+  tbb::task_arena task_arena(num_threads_);
+
+  task_arena.execute([&]{
+    tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
 #endif // !CERES_USE_TBB
 
     const ScopedThreadToken scoped_thread_token(&thread_token_provider);
@@ -317,7 +322,8 @@
         thread_id, bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
   }
 #ifdef CERES_USE_TBB
-    );
+  );
+  });
 #endif // CERES_USE_TBB
 
   // For rows with no e_blocks, the schur complement update reduces to
@@ -342,8 +348,10 @@
 #ifndef CERES_USE_TBB
   for (int i = 0; i < chunks_.size(); ++i) {
 #else
-  tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
-  tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
+  tbb::task_arena task_arena(num_threads_);
+
+  task_arena.execute([&]{
+    tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
 #endif // !CERES_USE_TBB
 
     const Chunk& chunk = chunks_[i];
@@ -403,6 +411,7 @@
   }
 #ifdef CERES_USE_TBB
   );
+  });
 #endif // CERES_USE_TBB
 }