Adds a ParallelFor wrapper for tbb::parallel_for.

This is in preparation for adding support for a c++11 based parallel
for implementation. The parallel for abstraction does not have the
ability to constrain the total number of threads in nested for loops.
This is solved by distributing the number of threads evenly between
the nested for loops. Adds a TODO to consolidate the next for loops
into a single loop that can be properly split between threads.

Tested by building with TBB and running tests.

Change-Id: I546973b9a4d19b9cdd53caff55d1c80bac8ea953
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index b7c99ac..c5fb259 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -88,6 +88,7 @@
     low_rank_inverse_hessian.cc
     minimizer.cc
     normal_prior.cc
+    parallel_for_tbb.cc
     parameter_block_ordering.cc
     partitioned_matrix_view.cc
     polynomial.cc
@@ -340,6 +341,7 @@
   ceres_test(normal_prior)
   ceres_test(numeric_diff_cost_function)
   ceres_test(ordered_groups)
+  ceres_test(parallel_for)
   ceres_test(parameter_block)
   ceres_test(parameter_block_ordering)
   ceres_test(partitioned_matrix_view)
diff --git a/internal/ceres/coordinate_descent_minimizer.cc b/internal/ceres/coordinate_descent_minimizer.cc
index df9c41f..884fbd2 100644
--- a/internal/ceres/coordinate_descent_minimizer.cc
+++ b/internal/ceres/coordinate_descent_minimizer.cc
@@ -31,8 +31,7 @@
 #include "ceres/coordinate_descent_minimizer.h"
 
 #ifdef CERES_USE_TBB
-#include <tbb/parallel_for.h>
-#include <tbb/task_arena.h>
+#include "ceres/parallel_for.h"
 #endif
 
 #include <iterator>
@@ -174,12 +173,10 @@
          j < independent_set_offsets_[i + 1];
          ++j) {
 #else
-    tbb::task_arena task_arena(num_inner_iteration_threads);
-
-    task_arena.execute([&]{
-      tbb::parallel_for(independent_set_offsets_[i],
-                        independent_set_offsets_[i + 1],
-                        [&](int j) {
+    ParallelFor(independent_set_offsets_[i],
+                independent_set_offsets_[i + 1],
+                num_inner_iteration_threads,
+                [&](int j) {
 #endif // !CERES_USE_TBB
 
       const ScopedThreadToken scoped_thread_token(&thread_token_provider);
@@ -217,7 +214,6 @@
     }
 #ifdef CERES_USE_TBB
   );
-  });
 #endif
   }
 
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index f2a345d..f8c510c 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -31,8 +31,7 @@
 #include "ceres/covariance_impl.h"
 
 #ifdef CERES_USE_TBB
-#include <tbb/parallel_for.h>
-#include <tbb/task_arena.h>
+#include "ceres/parallel_for.h"
 #endif
 
 #include <algorithm>
@@ -367,11 +366,16 @@
 #endif // CERES_NO_THREADS
 
 #ifdef CERES_USE_TBB
-  tbb::task_arena task_arena(num_threads);
-
-  task_arena.execute([&]{
-    tbb::parallel_for(0, num_parameters, [&](int i) {
-      tbb::parallel_for(i, num_parameters, [&](int j) {
+  // The parallel for abstraction does not have support for constraining the
+  // number of workers in nested parallel for loops. Consequently, we will try
+  // to evenly distribute the number of workers between the each parallel for
+  // loop.
+  // TODO(vitus): consolidate the nested for loops into a single loop which can
+  // be properly split between the threads.
+  const int num_outer_threads = std::sqrt(num_threads);
+  const int num_inner_threads = num_threads / num_outer_threads;
+  ParallelFor(0, num_parameters, num_outer_threads, [&](int i) {
+    ParallelFor(i, num_parameters, num_inner_threads, [&](int j) {
 #endif // CERES_USE_TBB
 
       int covariance_row_idx = cum_parameter_size[i];
@@ -403,7 +407,6 @@
 #ifdef CERES_USE_TBB
     );
   });
-  });
 #else
   }
 #endif // CERES_USE_TBB
@@ -730,10 +733,7 @@
 #ifndef CERES_USE_TBB
   for (int r = 0; r < num_cols; ++r) {
 #else
-  tbb::task_arena task_arena(num_threads);
-
-  task_arena.execute([&]{
-    tbb::parallel_for(0, num_cols, [&](int r) {
+  ParallelFor(0, num_cols, num_threads, [&](int r) {
 #endif // !CERES_USE_TBB
 
     const int row_begin = rows[r];
@@ -758,7 +758,6 @@
   }
 #ifdef CERES_USE_TBB
   );
-  });
 #endif // CERES_USE_TBB
 
   free(permutation);
@@ -934,10 +933,7 @@
 #ifndef CERES_USE_TBB
   for (int r = 0; r < num_cols; ++r) {
 #else
-  tbb::task_arena task_arena(num_threads);
-
-  task_arena.execute([&]{
-    tbb::parallel_for(0, num_cols, [&](int r) {
+  ParallelFor(0, num_cols, num_threads, [&](int r) {
 #endif // !CERES_USE_TBB
 
     const int row_begin = rows[r];
@@ -966,7 +962,6 @@
 
 #ifdef CERES_USE_TBB
   );
-  });
 #endif // CERES_USE_TBB
 
   event_logger.AddEvent("Inverse");
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
new file mode 100644
index 0000000..9222884
--- /dev/null
+++ b/internal/ceres/parallel_for.h
@@ -0,0 +1,48 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vitus@google.com (Michael Vitus)
+
+#ifndef CERES_INTERNAL_PARALLEL_FOR_
+#define CERES_INTERNAL_PARALLEL_FOR_
+
+#include <functional>
+
+namespace ceres {
+namespace internal {
+
+// Execute the function for every element in the range [start, end) with at most
+// num_threads. It will execute all the work on the calling thread if
+// num_threads is 1.
+void ParallelFor(int start, int end, int num_threads,
+                 const std::function<void(int)>& function);
+
+}  // namespace internal
+}  // namespace ceres
+
+#endif  // CERES_INTERNAL_PARALLEL_FOR_H_
diff --git a/internal/ceres/parallel_for_tbb.cc b/internal/ceres/parallel_for_tbb.cc
new file mode 100644
index 0000000..38a744f
--- /dev/null
+++ b/internal/ceres/parallel_for_tbb.cc
@@ -0,0 +1,70 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vitus@google.com (Michael Vitus)
+
+// This include must come before any #ifndef check on Ceres compile options.
+#include "ceres/internal/port.h"
+
+#ifdef CERES_USE_TBB
+
+#include "ceres/parallel_for.h"
+
+#include <tbb/parallel_for.h>
+#include <tbb/task_arena.h>
+
+#include "glog/logging.h"
+
+namespace ceres {
+namespace internal {
+
+void ParallelFor(int start, int end, int num_threads,
+                 const std::function<void(int)>& function) {
+  CHECK_GT(num_threads, 0);
+  if (end <= start) {
+    return;
+  }
+
+  // Fast path for when it is single threaded.
+  if (num_threads == 1) {
+    for (int i = start; i < end; ++i) {
+      function(i);
+    }
+    return;
+  }
+
+  tbb::task_arena task_arena(num_threads);
+  task_arena.execute([&]{
+      tbb::parallel_for(start, end, function);
+    });
+}
+
+}  // namespace internal
+}  // namespace ceres
+
+#endif // CERES_USE_TBB
diff --git a/internal/ceres/parallel_for_test.cc b/internal/ceres/parallel_for_test.cc
new file mode 100644
index 0000000..eb10a3b
--- /dev/null
+++ b/internal/ceres/parallel_for_test.cc
@@ -0,0 +1,83 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vitus@google.com (Michael Vitus)
+
+// This include must come before any #ifndef check on Ceres compile options.
+#include "ceres/internal/port.h"
+
+#ifdef CERES_USE_TBB
+
+#include "ceres/parallel_for.h"
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace ceres {
+namespace internal {
+
+using testing::ElementsAreArray;
+
+// Tests the parallel for loop computes the correct result for various number of
+// threads.
+TEST(ParallelFor, NumThreads) {
+  const int size = 16;
+  std::vector<int> expected_results(size, 0);
+  for (int i = 0; i < size; ++i) {
+    expected_results[i] = std::sqrt(i);
+  }
+
+  for (int num_threads = 1; num_threads <= 8; ++num_threads) {
+    std::vector<int> values(size, 0);
+    ParallelFor(0, size, num_threads,
+                [&values](int i) { values[i] = std::sqrt(i); });
+    EXPECT_THAT(values, ElementsAreArray(expected_results));
+  }
+}
+
+// Tests nested for loops do not result in a deadlock.
+TEST(ParallelFor, NestedParallelForDeadlock) {
+  // Increment each element in the 2D matrix.
+  std::vector<std::vector<int>> x(3, {1, 2, 3});
+  ParallelFor(0, 3, 2, [&x](int i) {
+    std::vector<int>& y = x.at(i);
+    ParallelFor(0, 3, 2, [&y](int j) { ++y.at(j); });
+  });
+
+  const std::vector<int> results = {2, 3, 4};
+  for (const std::vector<int>& value : x) {
+    EXPECT_THAT(value, ElementsAreArray(results));
+  }
+}
+
+}  // namespace internal
+}  // namespace ceres
+
+#endif // CERES_USE_TBB
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h
index b412747..36700b9 100644
--- a/internal/ceres/program_evaluator.h
+++ b/internal/ceres/program_evaluator.h
@@ -98,8 +98,7 @@
 #ifdef CERES_USE_TBB
 #include <atomic>
 
-#include <tbb/parallel_for.h>
-#include <tbb/task_arena.h>
+#include "ceres/parallel_for.h"
 #endif
 
 namespace ceres {
@@ -196,10 +195,8 @@
 
 #ifdef CERES_USE_TBB
     std::atomic_bool abort(false);
-    tbb::task_arena task_arena(options_.num_threads);
 
-    task_arena.execute([&]{
-        tbb::parallel_for(0, num_residual_blocks, [&](int i) {
+    ParallelFor(0, num_residual_blocks, options_.num_threads, [&](int i) {
 #endif // CERES_USE_TBB
 
       if (abort) {
@@ -290,7 +287,6 @@
     }
 #ifdef CERES_USE_TBB
     );
-    });
 #endif // CERES_USE_TBB
 
     if (!abort) {
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index 1bc4d8e..1776987 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -67,8 +67,7 @@
 #include "glog/logging.h"
 
 #ifdef CERES_USE_TBB
-#include <tbb/parallel_for.h>
-#include <tbb/task_arena.h>
+#include "ceres/parallel_for.h"
 #endif
 
 namespace ceres {
@@ -198,10 +197,8 @@
 #ifndef CERES_USE_TBB
     for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
 #else
-    tbb::task_arena task_arena(num_threads_);
-
-    task_arena.execute([&]{
-      tbb::parallel_for(num_eliminate_blocks_, num_col_blocks, [&](int i) {
+    ParallelFor(num_eliminate_blocks_, num_col_blocks, num_threads_,
+                [&](int i) {
 #endif // !CERES_USE_TBB
 
       const int block_id = i - num_eliminate_blocks_;
@@ -222,7 +219,6 @@
     }
 #ifdef CERES_USE_TBB
     );
-    });
 #endif // CERES_USE_TBB
   }
 
@@ -248,10 +244,7 @@
 #ifndef CERES_USE_TBB
   for (int i = 0; i < chunks_.size(); ++i) {
 #else
-  tbb::task_arena task_arena(num_threads_);
-
-  task_arena.execute([&]{
-    tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
+  ParallelFor(0, int(chunks_.size()), num_threads_, [&](int i) {
 #endif // !CERES_USE_TBB
 
     const ScopedThreadToken scoped_thread_token(&thread_token_provider);
@@ -323,7 +316,6 @@
   }
 #ifdef CERES_USE_TBB
   );
-  });
 #endif // CERES_USE_TBB
 
   // For rows with no e_blocks, the schur complement update reduces to
@@ -348,10 +340,7 @@
 #ifndef CERES_USE_TBB
   for (int i = 0; i < chunks_.size(); ++i) {
 #else
-  tbb::task_arena task_arena(num_threads_);
-
-  task_arena.execute([&]{
-    tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
+  ParallelFor(0, int(chunks_.size()), num_threads_, [&](int i) {
 #endif // !CERES_USE_TBB
 
     const Chunk& chunk = chunks_[i];
@@ -411,7 +400,6 @@
   }
 #ifdef CERES_USE_TBB
   );
-  });
 #endif // CERES_USE_TBB
 }