Add a ParallelFor wrapper for no threads and OpenMP.

With the addition of C++11 support we can simplify the parallel for code by
removing the ifdef branching. Converts coordinate_descent_minimizer.cc,
covariance_impl.cc, program_evaluator.h, and schur_eliminator_impl.h to use
the ParallelFor API (the thread_id overload where per-thread scratch is used).

Tested by building with OpenMP, C++11 threads, TBB, and no threads.  All tests
pass.

Also compared timing via the bundle adjuster.

./bin/bundle_adjuster --input=../problem-744-543562-pre.txt

With OpenMP num_threads=8

HEAD:
Time (in seconds):
  Residual only evaluation           0.807753 (5)
  Jacobian & residual evaluation     4.489404 (6)
  Linear solver                     41.826481 (5)
Minimizer                           50.745857
Total                               73.294424

CL:
Time (in seconds):
  Residual only evaluation           0.970483 (5)
  Jacobian & residual evaluation     4.647438 (6)
  Linear solver                     41.781892 (5)
Minimizer                           50.848904
Total                               73.089983

With OpenMP num_threads=1

HEAD:
Time (in seconds):
  Residual only evaluation           2.990246 (5)
  Jacobian & residual evaluation    14.132090 (6)
  Linear solver                     79.631951 (5)
Minimizer                          100.281847
Total                              122.946267

CL:
Time (in seconds):
  Residual only evaluation           3.075178 (5)
  Jacobian & residual evaluation    13.966451 (6)
  Linear solver                     77.005441 (5)
Minimizer                           97.568712
Total                              120.410454

Change-Id: I1857d7943073be7465b6c6476bf46ab11c5475a3
diff --git a/bazel/ceres.bzl b/bazel/ceres.bzl
index 6ba0137..8963370 100644
--- a/bazel/ceres.bzl
+++ b/bazel/ceres.bzl
@@ -91,6 +91,7 @@
     "minimizer.cc",
     "normal_prior.cc",
     "parallel_for_cxx.cc",
+    "parallel_for_openmp.cc",
     "parallel_for_tbb.cc",
     "parallel_utils.cc",
     "parameter_block_ordering.cc",
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 8924173..1dad396 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -91,6 +91,7 @@
     low_rank_inverse_hessian.cc
     minimizer.cc
     normal_prior.cc
+    parallel_for_openmp.cc
     parallel_for_cxx.cc
     parallel_for_tbb.cc
     parallel_utils.cc
diff --git a/internal/ceres/coordinate_descent_minimizer.cc b/internal/ceres/coordinate_descent_minimizer.cc
index 6f20f20..558faed 100644
--- a/internal/ceres/coordinate_descent_minimizer.cc
+++ b/internal/ceres/coordinate_descent_minimizer.cc
@@ -30,10 +30,6 @@
 
 #include "ceres/coordinate_descent_minimizer.h"
 
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-#include "ceres/parallel_for.h"
-#endif
-
 #include <algorithm>
 #include <iterator>
 #include <memory>
@@ -43,14 +39,13 @@
 #include "ceres/evaluator.h"
 #include "ceres/linear_solver.h"
 #include "ceres/minimizer.h"
+#include "ceres/parallel_for.h"
 #include "ceres/parameter_block.h"
 #include "ceres/parameter_block_ordering.h"
 #include "ceres/problem_impl.h"
 #include "ceres/program.h"
 #include "ceres/residual_block.h"
-#include "ceres/scoped_thread_token.h"
 #include "ceres/solver.h"
-#include "ceres/thread_token_provider.h"
 #include "ceres/trust_region_minimizer.h"
 #include "ceres/trust_region_strategy.h"
 
@@ -164,62 +159,45 @@
     evaluator_options_.num_threads =
         max(1, options.num_threads / num_inner_iteration_threads);
 
-    ThreadTokenProvider thread_token_provider(num_inner_iteration_threads);
-
-#ifdef CERES_USE_OPENMP
     // The parameter blocks in each independent set can be optimized
     // in parallel, since they do not co-occur in any residual block.
-#pragma omp parallel for num_threads(num_inner_iteration_threads)
-#endif
+    ParallelFor(
+        context_,
+        independent_set_offsets_[i],
+        independent_set_offsets_[i + 1],
+        num_inner_iteration_threads,
+        [&](int thread_id, int j) {
+          ParameterBlock* parameter_block = parameter_blocks_[j];
+          const int old_index = parameter_block->index();
+          const int old_delta_offset = parameter_block->delta_offset();
+          parameter_block->SetVarying();
+          parameter_block->set_index(0);
+          parameter_block->set_delta_offset(0);
 
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-    for (int j = independent_set_offsets_[i];
-         j < independent_set_offsets_[i + 1];
-         ++j) {
-#else
-    ParallelFor(context_,
-                independent_set_offsets_[i],
-                independent_set_offsets_[i + 1],
-                num_inner_iteration_threads,
-                [&](int j) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+          Program inner_program;
+          inner_program.mutable_parameter_blocks()->push_back(parameter_block);
+          *inner_program.mutable_residual_blocks() = residual_blocks_[j];
 
-      const ScopedThreadToken scoped_thread_token(&thread_token_provider);
-      const int thread_id = scoped_thread_token.token();
+          // TODO(sameeragarwal): Better error handling. Right now we
+          // assume that this is not going to lead to problems of any
+          // sort. Basically we should be checking for numerical failure
+          // of some sort.
+          //
+          // On the other hand, if the optimization is a failure, that in
+          // some ways is fine, since it won't change the parameters and
+          // we are fine.
+          Solver::Summary inner_summary;
+          Solve(&inner_program,
+                linear_solvers[thread_id],
+                parameters + parameter_block->state_offset(),
+                &inner_summary);
 
-      ParameterBlock* parameter_block = parameter_blocks_[j];
-      const int old_index = parameter_block->index();
-      const int old_delta_offset = parameter_block->delta_offset();
-      parameter_block->SetVarying();
-      parameter_block->set_index(0);
-      parameter_block->set_delta_offset(0);
-
-      Program inner_program;
-      inner_program.mutable_parameter_blocks()->push_back(parameter_block);
-      *inner_program.mutable_residual_blocks() = residual_blocks_[j];
-
-      // TODO(sameeragarwal): Better error handling. Right now we
-      // assume that this is not going to lead to problems of any
-      // sort. Basically we should be checking for numerical failure
-      // of some sort.
-      //
-      // On the other hand, if the optimization is a failure, that in
-      // some ways is fine, since it won't change the parameters and
-      // we are fine.
-      Solver::Summary inner_summary;
-      Solve(&inner_program,
-            linear_solvers[thread_id],
-            parameters + parameter_block->state_offset(),
-            &inner_summary);
-
-      parameter_block->set_index(old_index);
-      parameter_block->set_delta_offset(old_delta_offset);
-      parameter_block->SetState(parameters + parameter_block->state_offset());
-      parameter_block->SetConstant();
-    }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-  );
-#endif
+          parameter_block->set_index(old_index);
+          parameter_block->set_delta_offset(old_delta_offset);
+          parameter_block->SetState(parameters +
+                                    parameter_block->state_offset());
+          parameter_block->SetConstant();
+        });
   }
 
   for (int i =  0; i < parameter_blocks_.size(); ++i) {
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index 2552416..7c63c2c 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -30,10 +30,6 @@
 
 #include "ceres/covariance_impl.h"
 
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-#include "ceres/parallel_for.h"
-#endif
-
 #include <algorithm>
 #include <cstdlib>
 #include <memory>
@@ -53,13 +49,12 @@
 #include "ceres/crs_matrix.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/map_util.h"
+#include "ceres/parallel_for.h"
 #include "ceres/parallel_utils.h"
 #include "ceres/parameter_block.h"
 #include "ceres/problem_impl.h"
 #include "ceres/residual_block.h"
-#include "ceres/scoped_thread_token.h"
 #include "ceres/suitesparse.h"
-#include "ceres/thread_token_provider.h"
 #include "ceres/wall_time.h"
 #include "glog/logging.h"
 
@@ -344,60 +339,41 @@
 
   bool success = true;
 
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-  ThreadTokenProvider thread_token_provider(num_threads);
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
   // Technically the following code is a double nested loop where
   // i = 1:n, j = i:n.
   int iteration_count = (num_parameters * (num_parameters + 1)) / 2;
-#if defined(CERES_USE_OPENMP)
-#    pragma omp parallel for num_threads(num_threads) schedule(dynamic)
-#endif // CERES_USE_OPENMP
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-  for (int k = 0; k < iteration_count; ++k) {
-#else
   problem_->context()->EnsureMinimumThreads(num_threads);
-  ParallelFor(problem_->context(),
-              0,
-              iteration_count,
-              num_threads,
-              [&](int thread_id, int k) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-      int i, j;
-      LinearIndexToUpperTriangularIndex(k, num_parameters, &i, &j);
+  ParallelFor(
+      problem_->context(),
+      0,
+      iteration_count,
+      num_threads,
+      [&](int thread_id, int k) {
+        int i, j;
+        LinearIndexToUpperTriangularIndex(k, num_parameters, &i, &j);
 
-      int covariance_row_idx = cum_parameter_size[i];
-      int covariance_col_idx = cum_parameter_size[j];
-      int size_i = parameter_sizes[i];
-      int size_j = parameter_sizes[j];
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-      const ScopedThreadToken scoped_thread_token(&thread_token_provider);
-      const int thread_id = scoped_thread_token.token();
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-      double* covariance_block =
-          workspace.get() +
-          thread_id * max_covariance_block_size * max_covariance_block_size;
-      if (!GetCovarianceBlockInTangentOrAmbientSpace(
-              parameters[i], parameters[j], lift_covariance_to_ambient_space,
-              covariance_block)) {
-        success = false;
-      }
+        int covariance_row_idx = cum_parameter_size[i];
+        int covariance_col_idx = cum_parameter_size[j];
+        int size_i = parameter_sizes[i];
+        int size_j = parameter_sizes[j];
+        double* covariance_block =
+            workspace.get() + thread_id * max_covariance_block_size *
+            max_covariance_block_size;
+        if (!GetCovarianceBlockInTangentOrAmbientSpace(
+                parameters[i], parameters[j],
+                lift_covariance_to_ambient_space, covariance_block)) {
+          success = false;
+        }
 
-      covariance.block(covariance_row_idx, covariance_col_idx,
-                       size_i, size_j) =
-          MatrixRef(covariance_block, size_i, size_j);
+        covariance.block(covariance_row_idx, covariance_col_idx, size_i,
+                         size_j) = MatrixRef(covariance_block, size_i, size_j);
 
-      if (i != j) {
-        covariance.block(covariance_col_idx, covariance_row_idx,
-                         size_j, size_i) =
-            MatrixRef(covariance_block, size_i, size_j).transpose();
-
-      }
-    }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-   );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+        if (i != j) {
+          covariance.block(covariance_col_idx, covariance_row_idx,
+                           size_j, size_i) =
+              MatrixRef(covariance_block, size_i, size_j).transpose();
+        }
+      });
   return success;
 }
 
@@ -709,50 +685,27 @@
   const int num_threads = options_.num_threads;
   std::unique_ptr<double[]> workspace(new double[num_threads * num_cols]);
 
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-  ThreadTokenProvider thread_token_provider(num_threads);
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-#ifdef CERES_USE_OPENMP
-#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
-#endif // CERES_USE_OPENMP
-
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-  for (int r = 0; r < num_cols; ++r) {
-#else
   problem_->context()->EnsureMinimumThreads(num_threads);
-  ParallelFor(problem_->context(),
-              0,
-              num_cols,
-              num_threads,
-              [&](int thread_id, int r) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-    const int row_begin = rows[r];
-    const int row_end = rows[r + 1];
-    if (row_end != row_begin) {
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-      const ScopedThreadToken scoped_thread_token(&thread_token_provider);
-      const int thread_id = scoped_thread_token.token();
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-      double* solution = workspace.get() + thread_id * num_cols;
-      SolveRTRWithSparseRHS<SuiteSparse_long>(
-          num_cols,
-          static_cast<SuiteSparse_long*>(R->i),
-          static_cast<SuiteSparse_long*>(R->p),
-          static_cast<double*>(R->x),
-          inverse_permutation[r],
-          solution);
-      for (int idx = row_begin; idx < row_end; ++idx) {
-        const int c = cols[idx];
-        values[idx] = solution[inverse_permutation[c]];
-      }
-    }
-  }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-  );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+  ParallelFor(
+      problem_->context(),
+      0,
+      num_cols,
+      num_threads,
+      [&](int thread_id, int r) {
+        const int row_begin = rows[r];
+        const int row_end = rows[r + 1];
+        if (row_end != row_begin) {
+          double* solution = workspace.get() + thread_id * num_cols;
+          SolveRTRWithSparseRHS<SuiteSparse_long>(
+              num_cols, static_cast<SuiteSparse_long*>(R->i),
+              static_cast<SuiteSparse_long*>(R->p), static_cast<double*>(R->x),
+              inverse_permutation[r], solution);
+          for (int idx = row_begin; idx < row_end; ++idx) {
+            const int c = cols[idx];
+            values[idx] = solution[inverse_permutation[c]];
+          }
+        }
+      });
 
   free(permutation);
   cholmod_l_free_sparse(&R, &cc);
@@ -918,54 +871,33 @@
   const int num_threads = options_.num_threads;
   std::unique_ptr<double[]> workspace(new double[num_threads * num_cols]);
 
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-  ThreadTokenProvider thread_token_provider(num_threads);
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-#ifdef CERES_USE_OPENMP
-#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
-#endif // CERES_USE_OPENMP
-
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-  for (int r = 0; r < num_cols; ++r) {
-#else
   problem_->context()->EnsureMinimumThreads(num_threads);
-  ParallelFor(problem_->context(),
-              0,
+  ParallelFor(
+      problem_->context(),
+      0,
+      num_cols,
+      num_threads,
+      [&](int thread_id, int r) {
+        const int row_begin = rows[r];
+        const int row_end = rows[r + 1];
+        if (row_end != row_begin) {
+          double* solution = workspace.get() + thread_id * num_cols;
+          SolveRTRWithSparseRHS<int>(
               num_cols,
-              num_threads,
-              [&](int thread_id, int r) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+              qr_solver.matrixR().innerIndexPtr(),
+              qr_solver.matrixR().outerIndexPtr(),
+              &qr_solver.matrixR().data().value(0),
+              inverse_permutation.indices().coeff(r),
+              solution);
 
-    const int row_begin = rows[r];
-    const int row_end = rows[r + 1];
-    if (row_end != row_begin) {
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-      const ScopedThreadToken scoped_thread_token(&thread_token_provider);
-      const int thread_id = scoped_thread_token.token();
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-      double* solution = workspace.get() + thread_id * num_cols;
-      SolveRTRWithSparseRHS<int>(
-          num_cols,
-          qr_solver.matrixR().innerIndexPtr(),
-          qr_solver.matrixR().outerIndexPtr(),
-          &qr_solver.matrixR().data().value(0),
-          inverse_permutation.indices().coeff(r),
-          solution);
-
-      // Assign the values of the computed covariance using the
-      // inverse permutation used in the QR factorization.
-      for (int idx = row_begin; idx < row_end; ++idx) {
-       const int c = cols[idx];
-       values[idx] = solution[inverse_permutation.indices().coeff(c)];
-      }
-    }
-  }
-
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-  );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+          // Assign the values of the computed covariance using the
+          // inverse permutation used in the QR factorization.
+          for (int idx = row_begin; idx < row_end; ++idx) {
+            const int c = cols[idx];
+            values[idx] = solution[inverse_permutation.indices().coeff(c)];
+          }
+        }
+      });
 
   event_logger.AddEvent("Inverse");
 
diff --git a/internal/ceres/parallel_for_openmp.cc b/internal/ceres/parallel_for_openmp.cc
new file mode 100644
index 0000000..b64f3d7
--- /dev/null
+++ b/internal/ceres/parallel_for_openmp.cc
@@ -0,0 +1,83 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vitus@google.com (Michael Vitus)
+
+// This include must come before any #ifndef check on Ceres compile options.
+#include "ceres/internal/port.h"
+
+#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+
+#include "ceres/parallel_for.h"
+
+#include "ceres/scoped_thread_token.h"
+#include "ceres/thread_token_provider.h"
+#include "glog/logging.h"
+
+namespace ceres {
+namespace internal {
+
+void ParallelFor(ContextImpl* context,
+                 int start,
+                 int end,
+                 int num_threads,
+                 const std::function<void(int)>& function) {
+  CHECK_GT(num_threads, 0);
+  CHECK(context != NULL);
+  if (end <= start) {
+    return;
+  }
+
+#ifdef CERES_USE_OPENMP
+#pragma omp parallel for num_threads(num_threads) \
+    schedule(dynamic) if (num_threads > 1)
+#endif  // CERES_USE_OPENMP
+  for (int i = start; i < end; ++i) {
+    function(i);
+  }
+}
+
+void ParallelFor(ContextImpl* context,
+                 int start,
+                 int end,
+                 int num_threads,
+                 const std::function<void(int thread_id, int i)>& function) {
+  CHECK(context != NULL);
+
+  ThreadTokenProvider thread_token_provider(num_threads);
+  ParallelFor(context, start, end, num_threads, [&](int i) {
+    const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+    const int thread_id = scoped_thread_token.token();
+    function(thread_id, i);
+  });
+}
+
+}  // namespace internal
+}  // namespace ceres
+
+#endif  // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
diff --git a/internal/ceres/parallel_for_test.cc b/internal/ceres/parallel_for_test.cc
index 2de4865..04e5783 100644
--- a/internal/ceres/parallel_for_test.cc
+++ b/internal/ceres/parallel_for_test.cc
@@ -31,8 +31,6 @@
 // This include must come before any #ifndef check on Ceres compile options.
 #include "ceres/internal/port.h"
 
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-
 #include "ceres/parallel_for.h"
 
 #include <cmath>
@@ -129,6 +127,8 @@
   }
 }
 
+// This test is only valid when multithreading support is enabled.
+#ifndef CERES_NO_THREADS
 TEST(ParallelForWithThreadId, UniqueThreadIds) {
   // Ensure the hardware supports more than 1 thread to ensure the test will
   // pass.
@@ -157,8 +157,7 @@
 
   EXPECT_THAT(x, UnorderedElementsAreArray({0,1}));
 }
+#endif  // CERES_NO_THREADS
 
 }  // namespace internal
 }  // namespace ceres
-
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h
index 17175a6..13f2e83 100644
--- a/internal/ceres/program_evaluator.h
+++ b/internal/ceres/program_evaluator.h
@@ -82,25 +82,20 @@
 // This include must come before any #ifndef check on Ceres compile options.
 #include "ceres/internal/port.h"
 
+#include <atomic>
 #include <map>
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "ceres/evaluation_callback.h"
 #include "ceres/execution_summary.h"
 #include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
 #include "ceres/parameter_block.h"
 #include "ceres/program.h"
 #include "ceres/residual_block.h"
-#include "ceres/scoped_thread_token.h"
 #include "ceres/small_blas.h"
-#include "ceres/thread_token_provider.h"
-
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-#include <atomic>
-
-#include "ceres/parallel_for.h"
-#endif
 
 namespace ceres {
 namespace internal {
@@ -183,128 +178,84 @@
     }
 
     const int num_residual_blocks = program_->NumResidualBlocks();
-
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-    ThreadTokenProvider thread_token_provider(options_.num_threads);
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-#ifdef CERES_USE_OPENMP
-    // This bool is used to disable the loop if an error is encountered
-    // without breaking out of it. The remaining loop iterations are still run,
-    // but with an empty body, and so will finish quickly.
-    bool abort = false;
-#pragma omp parallel for num_threads(options_.num_threads)
-    for (int i = 0; i < num_residual_blocks; ++i) {
-// Disable the loop instead of breaking, as required by OpenMP.
-#pragma omp flush(abort)
-#endif // CERES_USE_OPENMP
-
-#ifdef CERES_NO_THREADS
-    bool abort = false;
-    for (int i = 0; i < num_residual_blocks; ++i) {
-#endif // CERES_NO_THREADS
-
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+    // This bool is used to disable the loop if an error is encountered without
+    // breaking out of it. The remaining loop iterations are still run, but with
+    // an empty body, and so will finish quickly.
     std::atomic_bool abort(false);
-
-    ParallelFor(options_.context,
-                0,
-                num_residual_blocks,
-                options_.num_threads,
-                [&](int thread_id, int i) {
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-
-      if (abort) {
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-        return;
-#else
-        continue;
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-      }
-
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-      const ScopedThreadToken scoped_thread_token(&thread_token_provider);
-      const int thread_id = scoped_thread_token.token();
-#endif  // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-      EvaluatePreparer* preparer = &evaluate_preparers_[thread_id];
-      EvaluateScratch* scratch = &evaluate_scratch_[thread_id];
-
-      // Prepare block residuals if requested.
-      const ResidualBlock* residual_block = program_->residual_blocks()[i];
-      double* block_residuals = NULL;
-      if (residuals != NULL) {
-        block_residuals = residuals + residual_layout_[i];
-      } else if (gradient != NULL) {
-        block_residuals = scratch->residual_block_residuals.get();
-      }
-
-      // Prepare block jacobians if requested.
-      double** block_jacobians = NULL;
-      if (jacobian != NULL || gradient != NULL) {
-        preparer->Prepare(residual_block,
-                          i,
-                          jacobian,
-                          scratch->jacobian_block_ptrs.get());
-        block_jacobians = scratch->jacobian_block_ptrs.get();
-      }
-
-      // Evaluate the cost, residuals, and jacobians.
-      double block_cost;
-      if (!residual_block->Evaluate(
-              evaluate_options.apply_loss_function,
-              &block_cost,
-              block_residuals,
-              block_jacobians,
-              scratch->residual_block_evaluate_scratch.get())) {
-        abort = true;
-#ifdef CERES_USE_OPENMP
-// This ensures that the OpenMP threads have a consistent view of 'abort'. Do
-// the flush inside the failure case so that there is usually only one
-// synchronization point per loop iteration instead of two.
-#pragma omp flush(abort)
-#endif // CERES_USE_OPENMP
-
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-        return;
-#else
-        continue;
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-      }
-
-      scratch->cost += block_cost;
-
-      // Store the jacobians, if they were requested.
-      if (jacobian != NULL) {
-        jacobian_writer_.Write(i,
-                               residual_layout_[i],
-                               block_jacobians,
-                               jacobian);
-      }
-
-      // Compute and store the gradient, if it was requested.
-      if (gradient != NULL) {
-        int num_residuals = residual_block->NumResiduals();
-        int num_parameter_blocks = residual_block->NumParameterBlocks();
-        for (int j = 0; j < num_parameter_blocks; ++j) {
-          const ParameterBlock* parameter_block =
-              residual_block->parameter_blocks()[j];
-          if (parameter_block->IsConstant()) {
-            continue;
+    ParallelFor(
+        options_.context,
+        0,
+        num_residual_blocks,
+        options_.num_threads,
+        [&](int thread_id, int i) {
+          if (abort) {
+            return;
           }
 
-          MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
-              block_jacobians[j],
-              num_residuals,
-              parameter_block->LocalSize(),
-              block_residuals,
-              scratch->gradient.get() + parameter_block->delta_offset());
-        }
-      }
-    }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-    );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+          EvaluatePreparer* preparer = &evaluate_preparers_[thread_id];
+          EvaluateScratch* scratch = &evaluate_scratch_[thread_id];
+
+          // Prepare block residuals if requested.
+          const ResidualBlock* residual_block = program_->residual_blocks()[i];
+          double* block_residuals = NULL;
+          if (residuals != NULL) {
+            block_residuals = residuals + residual_layout_[i];
+          } else if (gradient != NULL) {
+            block_residuals = scratch->residual_block_residuals.get();
+          }
+
+          // Prepare block jacobians if requested.
+          double** block_jacobians = NULL;
+          if (jacobian != NULL || gradient != NULL) {
+            preparer->Prepare(residual_block,
+                              i,
+                              jacobian,
+                              scratch->jacobian_block_ptrs.get());
+            block_jacobians = scratch->jacobian_block_ptrs.get();
+          }
+
+          // Evaluate the cost, residuals, and jacobians.
+          double block_cost;
+          if (!residual_block->Evaluate(
+                  evaluate_options.apply_loss_function,
+                  &block_cost,
+                  block_residuals,
+                  block_jacobians,
+                  scratch->residual_block_evaluate_scratch.get())) {
+            abort = true;
+            return;
+          }
+
+          scratch->cost += block_cost;
+
+          // Store the jacobians, if they were requested.
+          if (jacobian != NULL) {
+            jacobian_writer_.Write(i,
+                                   residual_layout_[i],
+                                   block_jacobians,
+                                   jacobian);
+          }
+
+          // Compute and store the gradient, if it was requested.
+          if (gradient != NULL) {
+            int num_residuals = residual_block->NumResiduals();
+            int num_parameter_blocks = residual_block->NumParameterBlocks();
+            for (int j = 0; j < num_parameter_blocks; ++j) {
+              const ParameterBlock* parameter_block =
+                  residual_block->parameter_blocks()[j];
+              if (parameter_block->IsConstant()) {
+                continue;
+              }
+
+              MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+                  block_jacobians[j],
+                  num_residuals,
+                  parameter_block->LocalSize(),
+                  block_residuals,
+                  scratch->gradient.get() + parameter_block->delta_offset());
+            }
+          }
+        });
 
     if (!abort) {
       const int num_parameters = program_->NumEffectiveParameters();
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index da5d922..203dcc9 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -51,6 +51,7 @@
 #include <algorithm>
 #include <map>
 
+#include "Eigen/Dense"
 #include "ceres/block_random_access_matrix.h"
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/block_structure.h"
@@ -58,18 +59,14 @@
 #include "ceres/internal/fixed_array.h"
 #include "ceres/invert_psd_matrix.h"
 #include "ceres/map_util.h"
+#include "ceres/parallel_for.h"
 #include "ceres/schur_eliminator.h"
 #include "ceres/scoped_thread_token.h"
 #include "ceres/small_blas.h"
 #include "ceres/stl_util.h"
 #include "ceres/thread_token_provider.h"
-#include "Eigen/Dense"
 #include "glog/logging.h"
 
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-#include "ceres/parallel_for.h"
-#endif
-
 namespace ceres {
 namespace internal {
 
@@ -190,43 +187,29 @@
 
   // Add the diagonal to the schur complement.
   if (D != NULL) {
-#ifdef CERES_USE_OPENMP
-#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
-#endif // CERES_USE_OPENMP
+    ParallelFor(
+        context_,
+        num_eliminate_blocks_,
+        num_col_blocks,
+        num_threads_,
+        [&](int i) {
+          const int block_id = i - num_eliminate_blocks_;
+          int r, c, row_stride, col_stride;
+          CellInfo* cell_info = lhs->GetCell(block_id, block_id, &r, &c,
+                                             &row_stride, &col_stride);
+          if (cell_info != NULL) {
+            const int block_size = bs->cols[i].size;
+            typename EigenTypes<Eigen::Dynamic>::ConstVectorRef diag(
+                D + bs->cols[i].position, block_size);
 
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-    for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
-#else
-    ParallelFor(context_, num_eliminate_blocks_, num_col_blocks, num_threads_,
-                [&](int i) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-      const int block_id = i - num_eliminate_blocks_;
-      int r, c, row_stride, col_stride;
-      CellInfo* cell_info = lhs->GetCell(block_id, block_id,
-                                         &r, &c,
-                                         &row_stride, &col_stride);
-      if (cell_info != NULL) {
-        const int block_size = bs->cols[i].size;
-        typename EigenTypes<Eigen::Dynamic>::ConstVectorRef
-            diag(D + bs->cols[i].position, block_size);
-
-        std::lock_guard<std::mutex> l(cell_info->m);
-        MatrixRef m(cell_info->values, row_stride, col_stride);
-        m.block(r, c, block_size, block_size).diagonal()
-            += diag.array().square().matrix();
-      }
-    }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-    );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+            std::lock_guard<std::mutex> l(cell_info->m);
+            MatrixRef m(cell_info->values, row_stride, col_stride);
+            m.block(r, c, block_size, block_size).diagonal() +=
+                diag.array().square().matrix();
+          }
+        });
   }
 
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-  ThreadTokenProvider thread_token_provider(num_threads_);
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-#ifdef CERES_USE_OPENMP
   // Eliminate y blocks one chunk at a time.  For each chunk, compute
   // the entries of the normal equations and the gradient vector block
   // corresponding to the y block and then apply Gaussian elimination
@@ -240,88 +223,76 @@
   // z blocks that share a row block/residual term with the y
   // block. EliminateRowOuterProduct does the corresponding operation
   // for the lhs of the reduced linear system.
-#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
-#endif // CERES_USE_OPENMP
+  ParallelFor(
+      context_,
+      0,
+      int(chunks_.size()),
+      num_threads_,
+      [&](int thread_id, int i) {
+        double* buffer = buffer_.get() + thread_id * buffer_size_;
+        const Chunk& chunk = chunks_[i];
+        const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
+        const int e_block_size = bs->cols[e_block_id].size;
 
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-  for (int i = 0; i < chunks_.size(); ++i) {
-    const ScopedThreadToken scoped_thread_token(&thread_token_provider);
-    const int thread_id = scoped_thread_token.token();
-#else
-    ParallelFor(context_,
-                0,
-                int(chunks_.size()),
-                num_threads_,
-                [&](int thread_id, int i) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+        VectorRef(buffer, buffer_size_).setZero();
 
-    double* buffer = buffer_.get() + thread_id * buffer_size_;
-    const Chunk& chunk = chunks_[i];
-    const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
-    const int e_block_size = bs->cols[e_block_id].size;
+        typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix
+            ete(e_block_size, e_block_size);
 
-    VectorRef(buffer, buffer_size_).setZero();
+        if (D != NULL) {
+          const typename EigenTypes<kEBlockSize>::ConstVectorRef
+              diag(D + bs->cols[e_block_id].position, e_block_size);
+          ete = diag.array().square().matrix().asDiagonal();
+        } else {
+          ete.setZero();
+        }
 
-    typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix
-        ete(e_block_size, e_block_size);
+        FixedArray<double, 8> g(e_block_size);
+        typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size);
+        gref.setZero();
 
-    if (D != NULL) {
-      const typename EigenTypes<kEBlockSize>::ConstVectorRef
-          diag(D + bs->cols[e_block_id].position, e_block_size);
-      ete = diag.array().square().matrix().asDiagonal();
-    } else {
-      ete.setZero();
-    }
+        // We are going to be computing
+        //
+        //   S += F'F - F'E(E'E)^{-1}E'F
+        //
+        // for each Chunk. The computation is broken down into a number of
+        // function calls as below.
 
-    FixedArray<double, 8> g(e_block_size);
-    typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size);
-    gref.setZero();
+        // Compute the outer product of the e_blocks with themselves (ete
+        // = E'E). Compute the product of the e_blocks with the
+        // corresponding f_blocks (buffer = E'F), the gradient of the terms
+        // in this chunk (g) and add the outer product of the f_blocks to
+        // Schur complement (S += F'F).
+        ChunkDiagonalBlockAndGradient(
+            chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs);
 
-    // We are going to be computing
-    //
-    //   S += F'F - F'E(E'E)^{-1}E'F
-    //
-    // for each Chunk. The computation is broken down into a number of
-    // function calls as below.
+        // Normally one wouldn't compute the inverse explicitly, but
+        // e_block_size will typically be a small number like 3, in
+        // which case it's much faster to compute the inverse once and
+        // use it to multiply other matrices/vectors instead of doing a
+        // Solve call over and over again.
+        typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix inverse_ete =
+            InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete);
 
-    // Compute the outer product of the e_blocks with themselves (ete
-    // = E'E). Compute the product of the e_blocks with the
-    // corresonding f_blocks (buffer = E'F), the gradient of the terms
-    // in this chunk (g) and add the outer product of the f_blocks to
-    // Schur complement (S += F'F).
-    ChunkDiagonalBlockAndGradient(
-        chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs);
+        // For the current chunk compute and update the rhs of the reduced
+        // linear system.
+        //
+        //   rhs = F'b - F'E(E'E)^(-1) E'b
 
-    // Normally one wouldn't compute the inverse explicitly, but
-    // e_block_size will typically be a small number like 3, in
-    // which case its much faster to compute the inverse once and
-    // use it to multiply other matrices/vectors instead of doing a
-    // Solve call over and over again.
-    typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix inverse_ete =
-        InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete);
+        FixedArray<double, 8> inverse_ete_g(e_block_size);
+        MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>(
+            inverse_ete.data(),
+            e_block_size,
+            e_block_size,
+            g.get(),
+            inverse_ete_g.get());
 
-    // For the current chunk compute and update the rhs of the reduced
-    // linear system.
-    //
-    //   rhs = F'b - F'E(E'E)^(-1) E'b
+        UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);
 
-    FixedArray<double, 8> inverse_ete_g(e_block_size);
-    MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>(
-        inverse_ete.data(),
-        e_block_size,
-        e_block_size,
-        g.get(),
-        inverse_ete_g.get());
-
-    UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);
-
-    // S -= F'E(E'E)^{-1}E'F
-    ChunkOuterProduct(
+        // S -= F'E(E'E)^{-1}E'F
+        ChunkOuterProduct(
         thread_id, bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
-  }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-  );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+      });
 
   // For rows with no e_blocks, the schur complement update reduces to
   // S += F'F.
@@ -338,21 +309,17 @@
                double* y) {
   const CompressedRowBlockStructure* bs = A->block_structure();
 
-#ifdef CERES_USE_OPENMP
-#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
-#endif // CERES_USE_OPENMP
-
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-  for (int i = 0; i < chunks_.size(); ++i) {
-#else
-  ParallelFor(context_, 0, int(chunks_.size()), num_threads_, [&](int i) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
+  ParallelFor(
+      context_,
+      0,
+      int(chunks_.size()),
+      num_threads_,
+      [&](int i) {
     const Chunk& chunk = chunks_[i];
     const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
     const int e_block_size = bs->cols[e_block_id].size;
 
-    double* y_ptr = y +  bs->cols[e_block_id].position;
+    double* y_ptr = y + bs->cols[e_block_id].position;
     typename EigenTypes<kEBlockSize>::VectorRef y_block(y_ptr, e_block_size);
 
     typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix
@@ -395,17 +362,14 @@
 
       MatrixTransposeMatrixMultiply
           <kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
-              values + e_cell.position, row.block.size, e_block_size,
-              values + e_cell.position, row.block.size, e_block_size,
-              ete.data(), 0, 0, e_block_size, e_block_size);
+          values + e_cell.position, row.block.size, e_block_size,
+          values + e_cell.position, row.block.size, e_block_size,
+          ete.data(), 0, 0, e_block_size, e_block_size);
     }
 
-    y_block = InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete)
-        * y_block;
-  }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-  );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+    y_block =
+        InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete) * y_block;
+  });
 }
 
 // Update the rhs of the reduced linear system. Compute
diff --git a/jni/Android.mk b/jni/Android.mk
index 2c58147..80861dd 100644
--- a/jni/Android.mk
+++ b/jni/Android.mk
@@ -178,6 +178,7 @@
                    $(CERES_SRC_PATH)/low_rank_inverse_hessian.cc \
                    $(CERES_SRC_PATH)/minimizer.cc \
                    $(CERES_SRC_PATH)/normal_prior.cc \
+                   $(CERES_SRC_PATH)/parallel_for_openmp.cc \
                    $(CERES_SRC_PATH)/parameter_block_ordering.cc \
                    $(CERES_SRC_PATH)/partitioned_matrix_view.cc \
                    $(CERES_SRC_PATH)/polynomial.cc \