Add TBB threading support.

There are platforms where OpenMP is not available. This
patch adds support for Intel Threading Building Blocks (TBB)
as an alternative threading backend.

Change-Id: I94497d7cba0c3cfaccfc992169236f17fe948ae9
diff --git a/CMakeLists.txt b/CMakeLists.txt
index be553d8..b289008 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,6 +104,8 @@
        ON)
 # Multithreading using OpenMP
 option(OPENMP "Enable threaded solving in Ceres (requires OpenMP)" ON)
+# Multithreading using TBB
+option(TBB "Enable threaded solving in Ceres with TBB (requires TBB and C++11)" OFF)
 # Enable the use of Eigen as a sparse linear algebra library for
 # solving the nonlinear least squares problems. Enabling this
 # option will result in an LGPL licensed version of Ceres Solver
@@ -190,6 +192,7 @@
   update_cache_variable(CXSPARSE OFF)
   update_cache_variable(GFLAGS OFF)
   update_cache_variable(OPENMP OFF)
+  update_cache_variable(TBB OFF)
   # Apple claims that the BLAS call dsyrk_ is a private API, and will not allow
   # you to submit to the Apple Store if the symbol is present.
   update_cache_variable(LAPACK OFF)
@@ -404,6 +407,14 @@
   message("-- Disabling custom blas")
 endif (NOT CUSTOM_BLAS)
 
+# OpenMP and TBB are mutually exclusive options. OpenMP is on by default thus
+# disable it if the user requested TBB.
+if (TBB AND OPENMP)
+  update_cache_variable(OPENMP OFF)
+  message("-- Intel TBB enabled; disabling OpenMP support (they are mutually "
+    "exclusive)")
+endif (TBB AND OPENMP)
+
 if (OPENMP)
   # Find quietly, as we can continue without OpenMP if it is not found.
   find_package(OpenMP QUIET)
@@ -412,13 +423,6 @@
     list(APPEND CERES_COMPILE_OPTIONS CERES_USE_OPENMP)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-    if (UNIX)
-      # At least on Linux, we need pthreads to be enabled for mutex to
-      # compile.  This may not work on Windows or Android.
-      find_package(Threads REQUIRED)
-      list(APPEND CERES_COMPILE_OPTIONS CERES_HAVE_PTHREAD)
-      list(APPEND CERES_COMPILE_OPTIONS CERES_HAVE_RWLOCK)
-    endif (UNIX)
   else (OPENMP_FOUND)
     message("-- Failed to find OpenMP, disabling. This is expected on "
       "Clang < 3.8, and at least Xcode <= 8.  See Ceres documentation for "
@@ -427,8 +431,7 @@
     list(APPEND CERES_COMPILE_OPTIONS CERES_NO_THREADS)
   endif (OPENMP_FOUND)
 else (OPENMP)
-  message("-- Building without OpenMP (disabling multithreading).")
-  list(APPEND CERES_COMPILE_OPTIONS CERES_NO_THREADS)
+  message("-- Building without OpenMP, disabling.")
 endif (OPENMP)
 
 # Initialise CMAKE_REQUIRED_FLAGS used by CheckCXXSourceCompiles with the
@@ -436,6 +439,33 @@
 # they are used when discovering shared_ptr/unordered_map.
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_CXX_FLAGS})
 
+if (TBB)
+  find_package(TBB REQUIRED)
+  if (TBB_FOUND)
+    message("-- Building with TBB.")
+    list(APPEND CERES_COMPILE_OPTIONS CERES_USE_TBB)
+    include_directories(${TBB_INCLUDE_DIRS})
+    # TBB requires C++11
+    update_cache_variable(CXX11 ON)
+  else (TBB_FOUND)
+    message("-- Failed to find TBB, disabling.")
+    update_cache_variable(TBB OFF)
+  endif (TBB_FOUND)
+endif (TBB)
+
+if (NOT OPENMP AND NOT TBB)
+  message("-- Nor OpenMP neither TBB is used, disabling multithreading.")
+  list(APPEND CERES_COMPILE_OPTIONS CERES_NO_THREADS)
+else (NOT OPENMP AND NOT TBB)
+  if (UNIX)
+    # At least on Linux, we need pthreads to be enabled for mutex to
+    # compile.  This may not work on Windows or Android.
+    find_package(Threads REQUIRED)
+    list(APPEND CERES_COMPILE_OPTIONS CERES_HAVE_PTHREAD)
+    list(APPEND CERES_COMPILE_OPTIONS CERES_HAVE_RWLOCK)
+  endif (UNIX)
+endif (NOT OPENMP AND NOT TBB)
+
 include(CheckCXXCompilerFlag)
 check_cxx_compiler_flag("-std=c++11" COMPILER_HAS_CXX11_FLAG)
 if (CXX11 AND COMPILER_HAS_CXX11_FLAG)
diff --git a/cmake/FindTBB.cmake b/cmake/FindTBB.cmake
new file mode 100644
index 0000000..8d57c80
--- /dev/null
+++ b/cmake/FindTBB.cmake
@@ -0,0 +1,245 @@
+# The MIT License (MIT)
+#
+# Copyright (c) 2015 Justus Calvin
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+#
+# FindTBB
+# -------
+#
+# Find TBB include directories and libraries.
+#
+# Usage:
+#
+#  find_package(TBB [major[.minor]] [EXACT]
+#               [QUIET] [REQUIRED]
+#               [[COMPONENTS] [components...]]
+#               [OPTIONAL_COMPONENTS components...])
+#
+# where the allowed components are tbbmalloc and tbb_preview. Users may modify
+# the behavior of this module with the following variables:
+#
+# * TBB_ROOT_DIR          - The base directory the of TBB installation.
+# * TBB_INCLUDE_DIR       - The directory that contains the TBB headers files.
+# * TBB_LIBRARY           - The directory that contains the TBB library files.
+# * TBB_<library>_LIBRARY - The path of the TBB the corresponding TBB library.
+#                           These libraries, if specified, override the
+#                           corresponding library search results, where <library>
+#                           may be tbb, tbb_debug, tbbmalloc, tbbmalloc_debug,
+#                           tbb_preview, or tbb_preview_debug.
+# * TBB_USE_DEBUG_BUILD   - The debug version of tbb libraries, if present, will
+#                           be used instead of the release version.
+#
+# Users may modify the behavior of this module with the following environment
+# variables:
+#
+# * TBB_INSTALL_DIR
+# * TBBROOT
+# * LIBRARY_PATH
+#
+# This module will set the following variables:
+#
+# * TBB_FOUND             - Set to false, or undefined, if we haven’t found, or
+#                           don’t want to use TBB.
+# * TBB_<component>_FOUND - If False, optional <component> part of TBB sytem is
+#                           not available.
+# * TBB_VERSION           - The full version string
+# * TBB_VERSION_MAJOR     - The major version
+# * TBB_VERSION_MINOR     - The minor version
+# * TBB_INTERFACE_VERSION - The interface version number defined in
+#                           tbb/tbb_stddef.h.
+# * TBB_<library>_LIBRARY_RELEASE - The path of the TBB release version of
+#                           <library>, where <library> may be tbb, tbb_debug,
+#                           tbbmalloc, tbbmalloc_debug, tbb_preview, or
+#                           tbb_preview_debug.
+# * TBB_<library>_LIBRARY_DEGUG - The path of the TBB release version of
+#                           <library>, where <library> may be tbb, tbb_debug,
+#                           tbbmalloc, tbbmalloc_debug, tbb_preview, or
+#                           tbb_preview_debug.
+#
+# The following varibles should be used to build and link with TBB:
+#
+# * TBB_INCLUDE_DIRS - The include directory for TBB.
+# * TBB_LIBRARIES    - The libraries to link against to use TBB.
+# * TBB_DEFINITIONS  - Definitions to use when compiling code that uses TBB.
+
+include(FindPackageHandleStandardArgs)
+
+if(NOT TBB_FOUND)
+
+  ##################################
+  # Check the build type
+  ##################################
+
+  if(NOT DEFINED TBB_USE_DEBUG_BUILD)
+    if(CMAKE_BUILD_TYPE MATCHES "[Debug|DEBUG|debug|RelWithDebInfo|RELWITHDEBINFO|relwithdebinfo]")
+      set(TBB_USE_DEBUG_BUILD TRUE)
+    else()
+      set(TBB_USE_DEBUG_BUILD FALSE)
+    endif()
+  endif()
+
+  ##################################
+  # Set the TBB search directories
+  ##################################
+
+  # Define search paths based on user input and environment variables
+  set(TBB_SEARCH_DIR ${TBB_ROOT_DIR} $ENV{TBB_INSTALL_DIR} $ENV{TBBROOT})
+
+  # Define the search directories based on the current platform
+  if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
+    set(TBB_DEFAULT_SEARCH_DIR "C:/Program Files/Intel/TBB"
+                               "C:/Program Files (x86)/Intel/TBB")
+
+    # Set the target architecture
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      set(TBB_ARCHITECTURE "intel64")
+    else()
+      set(TBB_ARCHITECTURE "ia32")
+    endif()
+
+    # Set the TBB search library path search suffix based on the version of VC
+    if(WINDOWS_STORE)
+      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc11_ui")
+    elseif(MSVC14)
+      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc14")
+    elseif(MSVC12)
+      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc12")
+    elseif(MSVC11)
+      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc11")
+    elseif(MSVC10)
+      set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc10")
+    endif()
+
+    # Add the library path search suffix for the VC independent version of TBB
+    list(APPEND TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc_mt")
+
+  elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+    # OS X
+    set(TBB_DEFAULT_SEARCH_DIR "/opt/intel/tbb")
+
+    # TODO: Check to see which C++ library is being used by the compiler.
+    if(NOT ${CMAKE_SYSTEM_VERSION} VERSION_LESS 13.0)
+      # The default C++ library on OS X 10.9 and later is libc++
+      set(TBB_LIB_PATH_SUFFIX "lib/libc++")
+    else()
+      set(TBB_LIB_PATH_SUFFIX "lib")
+    endif()
+  elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    # Linux
+    set(TBB_DEFAULT_SEARCH_DIR "/opt/intel/tbb")
+
+    # TODO: Check compiler version to see the suffix should be <arch>/gcc4.1 or
+    #       <arch>/gcc4.1. For now, assume that the compiler is more recent than
+    #       gcc 4.4.x or later.
+    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+      set(TBB_LIB_PATH_SUFFIX "lib/intel64/gcc4.4")
+    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$")
+      set(TBB_LIB_PATH_SUFFIX "lib/ia32/gcc4.4")
+    endif()
+  endif()
+
+  ##################################
+  # Find the TBB include dir
+  ##################################
+
+  find_path(TBB_INCLUDE_DIRS tbb/tbb.h
+      HINTS ${TBB_INCLUDE_DIR} ${TBB_SEARCH_DIR}
+      PATHS ${TBB_DEFAULT_SEARCH_DIR}
+      PATH_SUFFIXES include)
+
+  ##################################
+  # Find TBB components
+  ##################################
+
+  # Find each component
+  foreach(_comp tbb_preview tbbmalloc tbb)
+    # Search for the libraries
+    find_library(TBB_${_comp}_LIBRARY_RELEASE ${_comp}
+        HINTS ${TBB_LIBRARY} ${TBB_SEARCH_DIR}
+        PATHS ${TBB_DEFAULT_SEARCH_DIR}
+        PATH_SUFFIXES "${TBB_LIB_PATH_SUFFIX}")
+
+    find_library(TBB_${_comp}_LIBRARY_DEBUG ${_comp}_debug
+        HINTS ${TBB_LIBRARY} ${TBB_SEARCH_DIR}
+        PATHS ${TBB_DEFAULT_SEARCH_DIR} ENV LIBRARY_PATH
+        PATH_SUFFIXES "${TBB_LIB_PATH_SUFFIX}")
+
+
+    # Set the library to be used for the component
+    if(NOT TBB_${_comp}_LIBRARY)
+      if(TBB_USE_DEBUG_BUILD AND TBB_${_comp}_LIBRARY_DEBUG)
+        set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_DEBUG}")
+      elseif(TBB_${_comp}_LIBRARY_RELEASE)
+        set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_RELEASE}")
+      elseif(TBB_${_comp}_LIBRARY_DEBUG)
+        set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_DEBUG}")
+      endif()
+    endif()
+
+    # Set the TBB library list and component found variables
+    if(TBB_${_comp}_LIBRARY)
+      list(APPEND TBB_LIBRARIES "${TBB_${_comp}_LIBRARY}")
+      set(TBB_${_comp}_FOUND TRUE)
+    else()
+      set(TBB_${_comp}_FOUND FALSE)
+    endif()
+
+    mark_as_advanced(TBB_${_comp}_LIBRARY_RELEASE)
+    mark_as_advanced(TBB_${_comp}_LIBRARY_DEBUG)
+    mark_as_advanced(TBB_${_comp}_LIBRARY)
+
+  endforeach()
+
+  ##################################
+  # Set compile flags
+  ##################################
+
+  if(TBB_tbb_LIBRARY MATCHES "debug")
+    set(TBB_DEFINITIONS "-DTBB_USE_DEBUG=1")
+  endif()
+
+  ##################################
+  # Set version strings
+  ##################################
+
+  if(TBB_INCLUDE_DIRS)
+    file(READ "${TBB_INCLUDE_DIRS}/tbb/tbb_stddef.h" _tbb_version_file)
+    string(REGEX REPLACE ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1"
+            TBB_VERSION_MAJOR "${_tbb_version_file}")
+    string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1"
+            TBB_VERSION_MINOR "${_tbb_version_file}")
+    string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1"
+            TBB_INTERFACE_VERSION "${_tbb_version_file}")
+    set(TBB_VERSION "${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR}")
+  endif()
+
+  find_package_handle_standard_args(TBB
+      REQUIRED_VARS TBB_INCLUDE_DIRS TBB_LIBRARIES
+      HANDLE_COMPONENTS
+      VERSION_VAR TBB_VERSION)
+
+  mark_as_advanced(TBB_INCLUDE_DIRS TBB_LIBRARIES)
+
+  unset(TBB_ARCHITECTURE)
+  unset(TBB_LIB_PATH_SUFFIX)
+  unset(TBB_DEFAULT_SEARCH_DIR)
+
+endif()
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 0bf0f8f..792e315 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -67,8 +67,10 @@
 @CERES_NO_THREADS@
 // If defined Ceres was compiled with OpenMP multithreading support.
 @CERES_USE_OPENMP@
-// Additionally defined on *nix if Ceres was compiled with OpenMP support,
-// as in this case pthreads is also required.
+// If defined Ceres was compiled with TBB multithreading support.
+@CERES_USE_TBB@
+// Additionally defined on *nix if Ceres was compiled with OpenMP or TBB
+// support, as in this case pthreads is also required.
 @CERES_HAVE_PTHREAD@
 @CERES_HAVE_RWLOCK@
 
diff --git a/docs/source/features.rst b/docs/source/features.rst
index 1f5e91e..6a4b652 100644
--- a/docs/source/features.rst
+++ b/docs/source/features.rst
@@ -54,8 +54,8 @@
     of Non-linear Conjugate Gradients, BFGS and LBFGS.
 
 * **Speed** - Ceres Solver has been extensively optimized, with C++
-  templating, hand written linear algebra routines and OpenMP based
-  multithreading of the Jacobian evaluation and the linear solvers.
+  templating, hand written linear algebra routines and OpenMP or TBB
+  based multithreading of the Jacobian evaluation and the linear solvers.
 
 * **Solution Quality** Ceres is the `best performing`_ solver on the NIST
   problem set used by Mondragon and Borchers for benchmarking
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 3d34ee7..6c42ef0 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -89,6 +89,10 @@
   ``SuiteSparse``, and optionally used by Ceres directly for some
   operations.
 
+-  `TBB <https://www.threadingbuildingblocks.org/>`_ is a C++11 template
+  library for parallel programming that optionally can be used as an alternative
+  to OpenMP. **Optional**
+
   On ``UNIX`` OSes other than Mac OS X we recommend `ATLAS
   <http://math-atlas.sourceforge.net/>`_, which includes ``BLAS`` and
   ``LAPACK`` routines. It is also possible to use `OpenBLAS
@@ -611,9 +615,12 @@
    multi-threading with ``OpenMP`` is not supported. Turn this ``OFF``
    to disable multi-threading.
 
+#. ``TBB [Default: OFF]``: An alternative to ``OpenMP`` threading library that
+   requires C++11. This option is mutually exclusive to ``OpenMP``.
+
 #. ``CXX11 [Default: OFF]`` *Non-MSVC compilers only*.
 
-   Although Ceres does not currently use C++11, it does use
+   Although Ceres does not currently require C++11, it does use
    ``shared_ptr`` (required) and ``unordered_map`` (if available);
    both of which existed in the previous iterations of what became the
    C++11 standard: TR1 & C++0x.  As such, Ceres can compile on
diff --git a/docs/source/nnls_solving.rst b/docs/source/nnls_solving.rst
index fed0b3e..a699034 100644
--- a/docs/source/nnls_solving.rst
+++ b/docs/source/nnls_solving.rst
@@ -2111,8 +2111,8 @@
 
    Number of threads actually used by the solver for Jacobian and
    residual evaluation. This number is not equal to
-   :member:`Solver::Summary::num_threads_given` if `OpenMP` is not
-   available.
+   :member:`Solver::Summary::num_threads_given` if neither `OpenMP` nor `TBB`
+   is available.
 
 .. member:: int Solver::Summary::num_linear_solver_threads_given
 
@@ -2123,8 +2123,8 @@
 
    Number of threads actually used by the solver for solving the trust
    region problem. This number is not equal to
-   :member:`Solver::Summary::num_linear_solver_threads_given` if
-   `OpenMP` is not available.
+   :member:`Solver::Summary::num_linear_solver_threads_given` if neither
+   `OpenMP` nor `TBB` is available.
 
 .. member:: LinearSolverType Solver::Summary::linear_solver_type_given
 
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index c48472f..97db4f3 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -113,6 +113,7 @@
     split.cc
     stringprintf.cc
     suitesparse.cc
+    thread_token_provider.cc
     triplet_sparse_matrix.cc
     trust_region_preprocessor.cc
     trust_region_minimizer.cc
@@ -182,6 +183,13 @@
   endif()
 endif (OPENMP_FOUND)
 
+if (TBB_FOUND)
+  list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${TBB_tbb_LIBRARY})
+  if (NOT MSVC)
+    list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${CMAKE_THREAD_LIBS_INIT})
+  endif (NOT MSVC)
+endif (TBB_FOUND)
+
 set(CERES_LIBRARY_SOURCE
     ${CERES_INTERNAL_SRC}
     ${CERES_INTERNAL_HDRS}
diff --git a/internal/ceres/bundle_adjustment_test.cc b/internal/ceres/bundle_adjustment_test.cc
index 7cbf3f3..4209d6c 100644
--- a/internal/ceres/bundle_adjustment_test.cc
+++ b/internal/ceres/bundle_adjustment_test.cc
@@ -385,7 +385,7 @@
 }
 #endif  // CERES_USE_EIGEN_SPARSE
 
-#ifdef CERES_USE_OPENMP
+#ifndef CERES_NO_THREADS
 
 TEST_F(BundleAdjustmentTest, MultiThreadedDenseSchurWithAutomaticOrdering) {
   RunSolverForConfigAndExpectResidualsMatch(
@@ -555,7 +555,7 @@
       ThreadedSolverConfig(SPARSE_SCHUR, EIGEN_SPARSE, kUserOrdering));
 }
 #endif  // CERES_USE_EIGEN_SPARSE
-#endif  // CERES_USE_OPENMP
+#endif  // !CERES_NO_THREADS
 
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/coordinate_descent_minimizer.cc b/internal/ceres/coordinate_descent_minimizer.cc
index 83e0bd5..77e3bbf 100644
--- a/internal/ceres/coordinate_descent_minimizer.cc
+++ b/internal/ceres/coordinate_descent_minimizer.cc
@@ -30,8 +30,9 @@
 
 #include "ceres/coordinate_descent_minimizer.h"
 
-#ifdef CERES_USE_OPENMP
-#include <omp.h>
+#ifdef CERES_USE_TBB
+#include <tbb/parallel_for.h>
+#include <tbb/task_scheduler_init.h>
 #endif
 
 #include <iterator>
@@ -48,6 +49,8 @@
 #include "ceres/solver.h"
 #include "ceres/trust_region_minimizer.h"
 #include "ceres/trust_region_strategy.h"
+#include "ceres/thread_token_provider.h"
+#include "ceres/scoped_thread_token.h"
 
 namespace ceres {
 namespace internal {
@@ -148,30 +151,39 @@
   for (int i = 0; i < independent_set_offsets_.size() - 1; ++i) {
     const int num_problems =
         independent_set_offsets_[i + 1] - independent_set_offsets_[i];
-    // No point paying the price for an OpemMP call if the set is of
-    // size zero.
+    // Avoid parallelization overhead call if the set is empty.
     if (num_problems == 0) {
       continue;
     }
 
-#ifdef CERES_USE_OPENMP
+
     const int num_inner_iteration_threads =
         min(options.num_threads, num_problems);
     evaluator_options_.num_threads =
         max(1, options.num_threads / num_inner_iteration_threads);
 
+    ThreadTokenProvider thread_token_provider(num_inner_iteration_threads);
+
+#ifdef CERES_USE_OPENMP
     // The parameter blocks in each independent set can be optimized
     // in parallel, since they do not co-occur in any residual block.
 #pragma omp parallel for num_threads(num_inner_iteration_threads)
 #endif
+
+#ifndef CERES_USE_TBB
     for (int j = independent_set_offsets_[i];
          j < independent_set_offsets_[i + 1];
          ++j) {
-#ifdef CERES_USE_OPENMP
-      const int thread_id = omp_get_thread_num();
 #else
-      const int thread_id = 0;
-#endif
+    tbb::task_scheduler_init tbb_task_scheduler_init(
+        num_inner_iteration_threads);
+    tbb::parallel_for(independent_set_offsets_[i],
+                      independent_set_offsets_[i + 1],
+                      [&](int j) {
+#endif // !CERES_USE_TBB
+
+      const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+      const int thread_id = scoped_thread_token.token();
 
       ParameterBlock* parameter_block = parameter_blocks_[j];
       const int old_index = parameter_block->index();
@@ -203,6 +215,9 @@
       parameter_block->SetState(parameters + parameter_block->state_offset());
       parameter_block->SetConstant();
     }
+#ifdef CERES_USE_TBB
+  );
+#endif
   }
 
   for (int i =  0; i < parameter_blocks_.size(); ++i) {
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index c16e477..81b3ba1 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -30,8 +30,9 @@
 
 #include "ceres/covariance_impl.h"
 
-#ifdef CERES_USE_OPENMP
-#include <omp.h>
+#ifdef CERES_USE_TBB
+#include <tbb/parallel_for.h>
+#include <tbb/task_scheduler_init.h>
 #endif
 
 #include <algorithm>
@@ -55,7 +56,9 @@
 #include "ceres/parameter_block.h"
 #include "ceres/problem_impl.h"
 #include "ceres/residual_block.h"
+#include "ceres/scoped_thread_token.h"
 #include "ceres/suitesparse.h"
+#include "ceres/thread_token_provider.h"
 #include "ceres/wall_time.h"
 #include "glog/logging.h"
 
@@ -75,10 +78,10 @@
     : options_(options),
       is_computed_(false),
       is_valid_(false) {
-#ifndef CERES_USE_OPENMP
+#ifdef CERES_NO_THREADS
   if (options_.num_threads > 1) {
     LOG(WARNING)
-        << "OpenMP support is not compiled into this binary; "
+        << "Neither OpenMP nor TBB support is compiled into this binary; "
         << "only options.num_threads = 1 is supported. Switching "
         << "to single threaded mode.";
     options_.num_threads = 1;
@@ -339,49 +342,68 @@
 
   bool success = true;
 
+  ThreadTokenProvider thread_token_provider(num_threads);
+
+#ifdef CERES_USE_OPENMP
 // The collapse() directive is only supported in OpenMP 3.0 and higher. OpenMP
 // 3.0 was released in May 2008 (hence the version number).
-#if _OPENMP >= 200805
-#  pragma omp parallel for num_threads(num_threads) schedule(dynamic) collapse(2)
-#else
-#  pragma omp parallel for num_threads(num_threads) schedule(dynamic)
-#endif
+#  if _OPENMP >= 200805
+#    pragma omp parallel for num_threads(num_threads) schedule(dynamic) collapse(2)
+#  else
+#    pragma omp parallel for num_threads(num_threads) schedule(dynamic)
+#  endif
   for (int i = 0; i < num_parameters; ++i) {
     for (int j = 0; j < num_parameters; ++j) {
       // The second loop can't start from j = i for compatibility with OpenMP
       // collapse command. The conditional serves as a workaround
-      if (j >= i) {
-        int covariance_row_idx = cum_parameter_size[i];
-        int covariance_col_idx = cum_parameter_size[j];
-        int size_i = parameter_sizes[i];
-        int size_j = parameter_sizes[j];
-#ifdef CERES_USE_OPENMP
-        const int thread_id = omp_get_thread_num();
-#else
-        const int thread_id = 0;
-#endif
-        double* covariance_block =
-            workspace.get() +
-            thread_id * max_covariance_block_size * max_covariance_block_size;
-        if (!GetCovarianceBlockInTangentOrAmbientSpace(
-                parameters[i], parameters[j], lift_covariance_to_ambient_space,
-                covariance_block)) {
-          success = false;
-        }
+      if (j < i) {
+        continue;
+      }
+#endif // CERES_USE_OPENMP
 
-        covariance.block(covariance_row_idx, covariance_col_idx,
-                         size_i, size_j) =
-            MatrixRef(covariance_block, size_i, size_j);
+#ifdef CERES_NO_THREADS
+  for (int i = 0; i < num_parameters; ++i) {
+    for (int j = i; j < num_parameters; ++j) {
+#endif // CERES_NO_THREADS
 
-        if (i != j) {
-          covariance.block(covariance_col_idx, covariance_row_idx,
-                           size_j, size_i) =
-              MatrixRef(covariance_block, size_i, size_j).transpose();
+#ifdef CERES_USE_TBB
+  tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
+  tbb::parallel_for(0, num_parameters, [&](int i) {
+    tbb::parallel_for(i, num_parameters, [&](int j) {
+#endif // CERES_USE_TBB
 
-        }
+      int covariance_row_idx = cum_parameter_size[i];
+      int covariance_col_idx = cum_parameter_size[j];
+      int size_i = parameter_sizes[i];
+      int size_j = parameter_sizes[j];
+      const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+      const int thread_id = scoped_thread_token.token();
+      double* covariance_block =
+          workspace.get() +
+          thread_id * max_covariance_block_size * max_covariance_block_size;
+      if (!GetCovarianceBlockInTangentOrAmbientSpace(
+              parameters[i], parameters[j], lift_covariance_to_ambient_space,
+              covariance_block)) {
+        success = false;
+      }
+
+      covariance.block(covariance_row_idx, covariance_col_idx,
+                       size_i, size_j) =
+          MatrixRef(covariance_block, size_i, size_j);
+
+      if (i != j) {
+        covariance.block(covariance_col_idx, covariance_row_idx,
+                         size_j, size_i) =
+            MatrixRef(covariance_block, size_i, size_j).transpose();
+
       }
     }
+#ifdef CERES_USE_TBB
+    );
+  });
+#else
   }
+#endif // CERES_USE_TBB
   return success;
 }
 
@@ -696,33 +718,42 @@
   const int num_threads = options_.num_threads;
   scoped_array<double> workspace(new double[num_threads * num_cols]);
 
+  ThreadTokenProvider thread_token_provider(num_threads);
+
+#ifdef CERES_USE_OPENMP
 #pragma omp parallel for num_threads(num_threads) schedule(dynamic)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
   for (int r = 0; r < num_cols; ++r) {
+#else
+  tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
+  tbb::parallel_for(0, num_cols, [&](int r) {
+#endif // !CERES_USE_TBB
+
     const int row_begin = rows[r];
     const int row_end = rows[r + 1];
-    if (row_end == row_begin) {
-      continue;
-    }
+    if (row_end != row_begin) {
+      const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+      const int thread_id = scoped_thread_token.token();
 
-#  ifdef CERES_USE_OPENMP
-    const int thread_id = omp_get_thread_num();
-#  else
-    const int thread_id = 0;
-#  endif
-
-    double* solution = workspace.get() + thread_id * num_cols;
-    SolveRTRWithSparseRHS<SuiteSparse_long>(
-        num_cols,
-        static_cast<SuiteSparse_long*>(R->i),
-        static_cast<SuiteSparse_long*>(R->p),
-        static_cast<double*>(R->x),
-        inverse_permutation[r],
-        solution);
-    for (int idx = row_begin; idx < row_end; ++idx) {
-     const int c = cols[idx];
-     values[idx] = solution[inverse_permutation[c]];
+      double* solution = workspace.get() + thread_id * num_cols;
+      SolveRTRWithSparseRHS<SuiteSparse_long>(
+          num_cols,
+          static_cast<SuiteSparse_long*>(R->i),
+          static_cast<SuiteSparse_long*>(R->p),
+          static_cast<double*>(R->x),
+          inverse_permutation[r],
+          solution);
+      for (int idx = row_begin; idx < row_end; ++idx) {
+        const int c = cols[idx];
+        values[idx] = solution[inverse_permutation[c]];
+      }
     }
   }
+#ifdef CERES_USE_TBB
+  );
+#endif // CERES_USE_TBB
 
   free(permutation);
   cholmod_l_free_sparse(&R, &cc);
@@ -888,37 +919,47 @@
   const int num_threads = options_.num_threads;
   scoped_array<double> workspace(new double[num_threads * num_cols]);
 
+  ThreadTokenProvider thread_token_provider(num_threads);
+
+#ifdef CERES_USE_OPENMP
 #pragma omp parallel for num_threads(num_threads) schedule(dynamic)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
   for (int r = 0; r < num_cols; ++r) {
+#else
+  tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
+  tbb::parallel_for(0, num_cols, [&](int r) {
+#endif // !CERES_USE_TBB
+
     const int row_begin = rows[r];
     const int row_end = rows[r + 1];
-    if (row_end == row_begin) {
-      continue;
-    }
+    if (row_end != row_begin) {
+      const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+      const int thread_id = scoped_thread_token.token();
 
-#  ifdef CERES_USE_OPENMP
-    const int thread_id = omp_get_thread_num();
-#  else
-    const int thread_id = 0;
-#  endif
+      double* solution = workspace.get() + thread_id * num_cols;
+      SolveRTRWithSparseRHS<int>(
+          num_cols,
+          qr_solver.matrixR().innerIndexPtr(),
+          qr_solver.matrixR().outerIndexPtr(),
+          &qr_solver.matrixR().data().value(0),
+          inverse_permutation.indices().coeff(r),
+          solution);
 
-    double* solution = workspace.get() + thread_id * num_cols;
-    SolveRTRWithSparseRHS<int>(
-        num_cols,
-        qr_solver.matrixR().innerIndexPtr(),
-        qr_solver.matrixR().outerIndexPtr(),
-        &qr_solver.matrixR().data().value(0),
-        inverse_permutation.indices().coeff(r),
-        solution);
-
-    // Assign the values of the computed covariance using the
-    // inverse permutation used in the QR factorization.
-    for (int idx = row_begin; idx < row_end; ++idx) {
-     const int c = cols[idx];
-     values[idx] = solution[inverse_permutation.indices().coeff(c)];
+      // Assign the values of the computed covariance using the
+      // inverse permutation used in the QR factorization.
+      for (int idx = row_begin; idx < row_end; ++idx) {
+       const int c = cols[idx];
+       values[idx] = solution[inverse_permutation.indices().coeff(c)];
+      }
     }
   }
 
+#ifdef CERES_USE_TBB
+  );
+#endif // CERES_USE_TBB
+
   event_logger.AddEvent("Inverse");
 
   return true;
diff --git a/internal/ceres/preprocessor.cc b/internal/ceres/preprocessor.cc
index 4aba6a3..e9dc293 100644
--- a/internal/ceres/preprocessor.cc
+++ b/internal/ceres/preprocessor.cc
@@ -56,10 +56,10 @@
 }
 
 void ChangeNumThreadsIfNeeded(Solver::Options* options) {
-#ifndef CERES_USE_OPENMP
+#ifdef CERES_NO_THREADS
   if (options->num_threads > 1) {
     LOG(WARNING)
-        << "OpenMP support is not compiled into this binary; "
+        << "Neither OpenMP nor TBB support is compiled into this binary; "
         << "only options.num_threads = 1 is supported. Switching "
         << "to single threaded mode.";
     options->num_threads = 1;
@@ -69,12 +69,12 @@
   if (options->minimizer_type == TRUST_REGION &&
       options->num_linear_solver_threads > 1) {
     LOG(WARNING)
-        << "OpenMP support is not compiled into this binary; "
+        << "Neither OpenMP nor TBB support is compiled into this binary; "
         << "only options.num_linear_solver_threads=1 is supported. Switching "
         << "to single threaded mode.";
     options->num_linear_solver_threads = 1;
   }
-#endif  // CERES_USE_OPENMP
+#endif  // CERES_NO_THREADS
 }
 
 void SetupCommonMinimizerOptions(PreprocessedProblem* pp) {
diff --git a/internal/ceres/problem_impl.cc b/internal/ceres/problem_impl.cc
index 9861958..4088079 100644
--- a/internal/ceres/problem_impl.cc
+++ b/internal/ceres/problem_impl.cc
@@ -746,15 +746,15 @@
   // the Evaluator decides the storage for the Jacobian based on the
   // type of linear solver being used.
   evaluator_options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
-#ifndef CERES_USE_OPENMP
+#ifdef CERES_NO_THREADS
   LOG_IF(WARNING, evaluate_options.num_threads > 1)
-      << "OpenMP support is not compiled into this binary; "
+      << "Neither OpenMP nor TBB support is compiled into this binary; "
       << "only evaluate_options.num_threads = 1 is supported. Switching "
       << "to single threaded mode.";
   evaluator_options.num_threads = 1;
 #else
   evaluator_options.num_threads = evaluate_options.num_threads;
-#endif  // CERES_USE_OPENMP
+#endif  // CERES_NO_THREADS
 
   scoped_ptr<Evaluator> evaluator(
       new ProgramEvaluator<ScratchEvaluatePreparer,
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h
index 597ff18..6049d94 100644
--- a/internal/ceres/program_evaluator.h
+++ b/internal/ceres/program_evaluator.h
@@ -43,7 +43,7 @@
 // residual jacobians are written directly into their final position in the
 // block sparse matrix by the user's CostFunction; there is no copying.
 //
-// The evaluation is threaded with OpenMP.
+// The evaluation is threaded with OpenMP or TBB.
 //
 // The EvaluatePreparer and JacobianWriter interfaces are as follows:
 //
@@ -82,10 +82,6 @@
 // This include must come before any #ifndef check on Ceres compile options.
 #include "ceres/internal/port.h"
 
-#ifdef CERES_USE_OPENMP
-#include <omp.h>
-#endif
-
 #include <map>
 #include <string>
 #include <vector>
@@ -95,7 +91,16 @@
 #include "ceres/parameter_block.h"
 #include "ceres/program.h"
 #include "ceres/residual_block.h"
+#include "ceres/scoped_thread_token.h"
 #include "ceres/small_blas.h"
+#include "ceres/thread_token_provider.h"
+
+#ifdef CERES_USE_TBB
+#include <atomic>
+
+#include <tbb/parallel_for.h>
+#include <tbb/task_scheduler_init.h>
+#endif
 
 namespace ceres {
 namespace internal {
@@ -115,15 +120,15 @@
         jacobian_writer_(options, program),
         evaluate_preparers_(
             jacobian_writer_.CreateEvaluatePreparers(options.num_threads)) {
-#ifndef CERES_USE_OPENMP
+#ifdef CERES_NO_THREADS
     if (options_.num_threads > 1) {
       LOG(WARNING)
-          << "OpenMP support is not compiled into this binary; "
+          << "Neither OpenMP nor TBB support is compiled into this binary; "
           << "only options.num_threads = 1 is supported. Switching "
           << "to single threaded mode.";
       options_.num_threads = 1;
     }
-#endif
+#endif // CERES_NO_THREADS
 
     BuildResidualLayout(*program, &residual_layout_);
     evaluate_scratch_.reset(CreateEvaluatorScratch(*program,
@@ -169,24 +174,43 @@
       }
     }
 
+    const int num_residual_blocks = program_->NumResidualBlocks();
+
+    ThreadTokenProvider thread_token_provider(options_.num_threads);
+
+#ifdef CERES_USE_OPENMP
     // This bool is used to disable the loop if an error is encountered
     // without breaking out of it. The remaining loop iterations are still run,
     // but with an empty body, and so will finish quickly.
     bool abort = false;
-    int num_residual_blocks = program_->NumResidualBlocks();
 #pragma omp parallel for num_threads(options_.num_threads)
     for (int i = 0; i < num_residual_blocks; ++i) {
 // Disable the loop instead of breaking, as required by OpenMP.
 #pragma omp flush(abort)
+#endif // CERES_USE_OPENMP
+
+#ifdef CERES_NO_THREADS
+    bool abort = false;
+    for (int i = 0; i < num_residual_blocks; ++i) {
+#endif // CERES_NO_THREADS
+
+#ifdef CERES_USE_TBB
+    std::atomic_bool abort(false);
+    tbb::task_scheduler_init tbb_task_scheduler_init(options_.num_threads);
+    tbb::parallel_for(0, num_residual_blocks, [&](int i) {
+#endif // CERES_USE_TBB
+
       if (abort) {
+#ifndef CERES_USE_TBB
         continue;
+#else
+        return;
+#endif // !CERES_USE_TBB
       }
 
-#ifdef CERES_USE_OPENMP
-      const int thread_id = omp_get_thread_num();
-#else
-      const int thread_id = 0;
-#endif
+      const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+      const int thread_id = scoped_thread_token.token();
+
       EvaluatePreparer* preparer = &evaluate_preparers_[thread_id];
       EvaluateScratch* scratch = &evaluate_scratch_[thread_id];
 
@@ -218,11 +242,18 @@
               block_jacobians,
               scratch->residual_block_evaluate_scratch.get())) {
         abort = true;
+#ifdef CERES_USE_OPENMP
 // This ensures that the OpenMP threads have a consistent view of 'abort'. Do
 // the flush inside the failure case so that there is usually only one
 // synchronization point per loop iteration instead of two.
 #pragma omp flush(abort)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
         continue;
+#else
+        return;
+#endif // !CERES_USE_TBB
       }
 
       scratch->cost += block_cost;
@@ -255,6 +286,9 @@
         }
       }
     }
+#ifdef CERES_USE_TBB
+    );
+#endif // CERES_USE_TBB
 
     if (!abort) {
       const int num_parameters = program_->NumEffectiveParameters();
diff --git a/internal/ceres/schur_eliminator.h b/internal/ceres/schur_eliminator.h
index a546228..667f384 100644
--- a/internal/ceres/schur_eliminator.h
+++ b/internal/ceres/schur_eliminator.h
@@ -296,7 +296,8 @@
                  const double* inverse_ete_g,
                  double* rhs);
 
-  void ChunkOuterProduct(const CompressedRowBlockStructure* bs,
+  void ChunkOuterProduct(int thread_id,
+                         const CompressedRowBlockStructure* bs,
                          const Matrix& inverse_eet,
                          const double* buffer,
                          const BufferLayoutType& buffer_layout,
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index 5091b93..01409fd 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -48,10 +48,6 @@
 // This include must come before any #ifndef check on Ceres compile options.
 #include "ceres/internal/port.h"
 
-#ifdef CERES_USE_OPENMP
-#include <omp.h>
-#endif
-
 #include <algorithm>
 #include <map>
 #include "ceres/block_random_access_matrix.h"
@@ -63,11 +59,18 @@
 #include "ceres/invert_psd_matrix.h"
 #include "ceres/map_util.h"
 #include "ceres/schur_eliminator.h"
+#include "ceres/scoped_thread_token.h"
 #include "ceres/small_blas.h"
 #include "ceres/stl_util.h"
+#include "ceres/thread_token_provider.h"
 #include "Eigen/Dense"
 #include "glog/logging.h"
 
+#ifdef CERES_USE_TBB
+#include <tbb/parallel_for.h>
+#include <tbb/task_scheduler_init.h>
+#endif
+
 namespace ceres {
 namespace internal {
 
@@ -188,8 +191,17 @@
 
   // Add the diagonal to the schur complement.
   if (D != NULL) {
+#ifdef CERES_USE_OPENMP
 #pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
     for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
+#else
+    tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
+    tbb::parallel_for(num_eliminate_blocks_, num_col_blocks, [&](int i) {
+#endif // !CERES_USE_TBB
+
       const int block_id = i - num_eliminate_blocks_;
       int r, c, row_stride, col_stride;
       CellInfo* cell_info = lhs->GetCell(block_id, block_id,
@@ -206,8 +218,14 @@
             += diag.array().square().matrix();
       }
     }
+#ifdef CERES_USE_TBB
+    );
+#endif // CERES_USE_TBB
   }
 
+  ThreadTokenProvider thread_token_provider(num_threads_);
+
+#ifdef CERES_USE_OPENMP
   // Eliminate y blocks one chunk at a time.  For each chunk, compute
   // the entries of the normal equations and the gradient vector block
   // corresponding to the y block and then apply Gaussian elimination
@@ -222,12 +240,18 @@
   // block. EliminateRowOuterProduct does the corresponding operation
   // for the lhs of the reduced linear system.
 #pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
   for (int i = 0; i < chunks_.size(); ++i) {
-#ifdef CERES_USE_OPENMP
-    const int thread_id = omp_get_thread_num();
 #else
-    const int thread_id = 0;
-#endif
+  tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
+  tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
+#endif // !CERES_USE_TBB
+
+    const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+    const int thread_id = scoped_thread_token.token();
+
     double* buffer = buffer_.get() + thread_id * buffer_size_;
     const Chunk& chunk = chunks_[i];
     const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
@@ -289,8 +313,12 @@
     UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);
 
     // S -= F'E(E'E)^{-1}E'F
-    ChunkOuterProduct(bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
+    ChunkOuterProduct(
+        thread_id, bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
   }
+#ifdef CERES_USE_TBB
+    );
+#endif // CERES_USE_TBB
 
   // For rows with no e_blocks, the schur complement update reduces to
   // S += F'F.
@@ -306,8 +334,18 @@
                const double* z,
                double* y) {
   const CompressedRowBlockStructure* bs = A->block_structure();
+
+#ifdef CERES_USE_OPENMP
 #pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
   for (int i = 0; i < chunks_.size(); ++i) {
+#else
+  tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
+  tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
+#endif // !CERES_USE_TBB
+
     const Chunk& chunk = chunks_[i];
     const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
     const int e_block_size = bs->cols[e_block_id].size;
@@ -363,6 +401,9 @@
     y_block = InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete)
         * y_block;
   }
+#ifdef CERES_USE_TBB
+  );
+#endif // CERES_USE_TBB
 }
 
 // Update the rhs of the reduced linear system. Compute
@@ -496,7 +537,8 @@
 template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
 void
 SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
-ChunkOuterProduct(const CompressedRowBlockStructure* bs,
+ChunkOuterProduct(int thread_id,
+                  const CompressedRowBlockStructure* bs,
                   const Matrix& inverse_ete,
                   const double* buffer,
                   const BufferLayoutType& buffer_layout,
@@ -508,11 +550,6 @@
   const int e_block_size = inverse_ete.rows();
   BufferLayoutType::const_iterator it1 = buffer_layout.begin();
 
-#ifdef CERES_USE_OPENMP
-  const int thread_id = omp_get_thread_num();
-#else
-  const int thread_id = 0;
-#endif
   double* b1_transpose_inverse_ete =
       chunk_outer_product_buffer_.get() + thread_id * buffer_size_;
 
diff --git a/internal/ceres/scoped_thread_token.h b/internal/ceres/scoped_thread_token.h
new file mode 100644
index 0000000..c167397
--- /dev/null
+++ b/internal/ceres/scoped_thread_token.h
@@ -0,0 +1,61 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2017 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: yp@photonscore.de (Yury Prokazov)
+
+#ifndef CERES_INTERNAL_SCOPED_THREAD_TOKEN_H_
+#define CERES_INTERNAL_SCOPED_THREAD_TOKEN_H_
+
+#include "ceres/thread_token_provider.h"
+
+namespace ceres {
+namespace internal {
+
+// Helper class for ThreadTokenProvider. This object acquires a token in its
+// constructor and puts that token back with destruction.
+class ScopedThreadToken {
+ public:
+  ScopedThreadToken(ThreadTokenProvider* provider)
+      : provider_(provider), token_(provider->Acquire()) {}
+
+  ~ScopedThreadToken() { provider_->Release(token_); }
+
+  int token() const { return token_; }
+
+ private:
+  ThreadTokenProvider* provider_;
+  int token_;
+
+  ScopedThreadToken(ScopedThreadToken&);
+  ScopedThreadToken& operator=(ScopedThreadToken&);
+};
+
+}  // namespace internal
+}  // namespace ceres
+
+#endif  // CERES_INTERNAL_SCOPED_THREAD_TOKEN_H_
diff --git a/internal/ceres/thread_token_provider.cc b/internal/ceres/thread_token_provider.cc
new file mode 100644
index 0000000..5648d4d
--- /dev/null
+++ b/internal/ceres/thread_token_provider.cc
@@ -0,0 +1,74 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2017 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: yp@photonscore.de (Yury Prokazov)
+
+#include "ceres/thread_token_provider.h"
+
+#ifdef CERES_USE_OPENMP
+#include <omp.h>
+#endif
+
+namespace ceres {
+namespace internal {
+
+ThreadTokenProvider::ThreadTokenProvider(int num_threads) {
+  (void)num_threads;
+#ifdef CERES_USE_TBB
+  pool_.set_capacity(num_threads);
+  for (int i = 0; i < num_threads; i++) {
+    pool_.push(i);
+  }
+#endif
+}
+
+int ThreadTokenProvider::Acquire() {
+#ifdef CERES_USE_OPENMP
+  return omp_get_thread_num();
+#endif
+
+#ifdef CERES_NO_THREADS
+  return 0;
+#endif
+
+#ifdef CERES_USE_TBB
+  int thread_id;
+  pool_.pop(thread_id);
+  return thread_id;
+#endif
+}
+
+void ThreadTokenProvider::Release(int thread_id) {
+  (void)thread_id;
+#ifdef CERES_USE_TBB
+  pool_.push(thread_id);
+#endif
+}
+
+}  // namespace internal
+}  // namespace ceres
diff --git a/internal/ceres/thread_token_provider.h b/internal/ceres/thread_token_provider.h
new file mode 100644
index 0000000..209445d
--- /dev/null
+++ b/internal/ceres/thread_token_provider.h
@@ -0,0 +1,112 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2017 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: yp@photonscore.de (Yury Prokazov)
+
+#ifndef CERES_INTERNAL_THREAD_TOKEN_PROVIDER_H_
+#define CERES_INTERNAL_THREAD_TOKEN_PROVIDER_H_
+
+#include "ceres/internal/config.h"
+
+#if defined(CERES_USE_OPENMP)
+#  if defined(CERES_USE_TBB) || defined(CERES_NO_THREADS)
+#    error CERES_USE_OPENMP is mutually exclusive to CERES_USE_TBB and CERES_NO_THREADS
+#  endif
+#elif defined(CERES_USE_TBB)
+#  if defined(CERES_USE_OPENMP) || defined(CERES_NO_THREADS)
+#    error CERES_USE_TBB is mutually exclusive to CERES_USE_OPENMP and CERES_NO_THREADS
+#  endif
+#elif defined(CERES_NO_THREADS)
+#  if defined(CERES_USE_OPENMP) || defined(CERES_USE_TBB)
+#    error CERES_NO_THREADS is mutually exclusive to CERES_USE_OPENMP and CERES_USE_TBB
+#  endif
+#else
+#  error One of CERES_USE_OPENMP, CERES_USE_TBB or CERES_NO_THREADS must be defined.
+#endif
+
+#ifdef CERES_USE_TBB
+#include <tbb/concurrent_queue.h>
+#endif
+
+namespace ceres {
+namespace internal {
+
+// Helper for TBB thread number identification that is similar to
+// omp_get_thread_num() behaviour. This is necessary to support TBB threading
+// with a sequential thread id. This is used to access preallocated resources in
+// the parallelized code parts.  The sequence of tokens varies from 0 to
+// num_threads - 1 that can be acquired to identify the thread in a thread pool.
+//
+// If CERES_NO_THREADS is defined, Acquire() always returns 0 and Release()
+// takes no action.
+//
+// If CERES_USE_OPENMP, omp_get_thread_num() is used to Acquire() with no action
+// in Release()
+//
+//
+// Example usage pseudocode:
+//
+// ThreadTokenProvider ttp(N); // allocate N tokens
+// Spawn N threads {
+//    int token = ttp.Acquire(); // get unique token
+//    ...
+//    ... use token to access resources bound to the thread
+//    ...
+//    ttp.Release(token); // return token to the pool
+//  }
+//
+class ThreadTokenProvider {
+ public:
+  ThreadTokenProvider(int num_threads);
+
+  // Returns the first token from the queue. The acquired value must be
+  // given back by Release().
+  //
+  // With TBB this function will block if all the tokens are aquired by
+  // concurent threads.
+  int Acquire();
+
+  // Makes previously acquired token available for other threads.
+  void Release(int thread_id);
+
+ private:
+#ifdef CERES_USE_TBB
+  // This queue initially holds a sequence from 0..num_threads-1. Every
+  // Acquire() call the first number is removed from here. When the token is not
+  // needed anymore it shall be given back with corresponding Release() call.
+  tbb::concurrent_bounded_queue<int> pool_;
+#endif
+
+  ThreadTokenProvider(ThreadTokenProvider&);
+  ThreadTokenProvider& operator=(ThreadTokenProvider&);
+};
+
+}  // namespace internal
+}  // namespace ceres
+
+#endif  // CERES_INTERNAL_THREAD_TOKEN_PROVIDER_H_