Add TBB threading support.
There are platforms where OpenMP is not available. This
patch adds support for Intel Threading Building Blocks (TBB)
as an alternative threading backend.
Change-Id: I94497d7cba0c3cfaccfc992169236f17fe948ae9
diff --git a/CMakeLists.txt b/CMakeLists.txt
index be553d8..b289008 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,6 +104,8 @@
ON)
# Multithreading using OpenMP
option(OPENMP "Enable threaded solving in Ceres (requires OpenMP)" ON)
+# Multithreading using TBB
+option(TBB "Enable threaded solving in Ceres with TBB (requires TBB and C++11)" OFF)
# Enable the use of Eigen as a sparse linear algebra library for
# solving the nonlinear least squares problems. Enabling this
# option will result in an LGPL licensed version of Ceres Solver
@@ -190,6 +192,7 @@
update_cache_variable(CXSPARSE OFF)
update_cache_variable(GFLAGS OFF)
update_cache_variable(OPENMP OFF)
+ update_cache_variable(TBB OFF)
# Apple claims that the BLAS call dsyrk_ is a private API, and will not allow
# you to submit to the Apple Store if the symbol is present.
update_cache_variable(LAPACK OFF)
@@ -404,6 +407,14 @@
message("-- Disabling custom blas")
endif (NOT CUSTOM_BLAS)
+# OpenMP and TBB are mutually exclusive options. OpenMP is on by default thus
+# disable it if the user requested TBB.
+if (TBB AND OPENMP)
+ update_cache_variable(OPENMP OFF)
+ message("-- Intel TBB enabled; disabling OpenMP support (they are mutually "
+ "exclusive)")
+endif (TBB AND OPENMP)
+
if (OPENMP)
# Find quietly, as we can continue without OpenMP if it is not found.
find_package(OpenMP QUIET)
@@ -412,13 +423,6 @@
list(APPEND CERES_COMPILE_OPTIONS CERES_USE_OPENMP)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
- if (UNIX)
- # At least on Linux, we need pthreads to be enabled for mutex to
- # compile. This may not work on Windows or Android.
- find_package(Threads REQUIRED)
- list(APPEND CERES_COMPILE_OPTIONS CERES_HAVE_PTHREAD)
- list(APPEND CERES_COMPILE_OPTIONS CERES_HAVE_RWLOCK)
- endif (UNIX)
else (OPENMP_FOUND)
message("-- Failed to find OpenMP, disabling. This is expected on "
"Clang < 3.8, and at least Xcode <= 8. See Ceres documentation for "
@@ -427,8 +431,7 @@
list(APPEND CERES_COMPILE_OPTIONS CERES_NO_THREADS)
endif (OPENMP_FOUND)
else (OPENMP)
- message("-- Building without OpenMP (disabling multithreading).")
- list(APPEND CERES_COMPILE_OPTIONS CERES_NO_THREADS)
+ message("-- Building without OpenMP, disabling.")
endif (OPENMP)
# Initialise CMAKE_REQUIRED_FLAGS used by CheckCXXSourceCompiles with the
@@ -436,6 +439,33 @@
# they are used when discovering shared_ptr/unordered_map.
set(CMAKE_REQUIRED_FLAGS ${CMAKE_CXX_FLAGS})
+if (TBB)
+ find_package(TBB REQUIRED)
+ if (TBB_FOUND)
+ message("-- Building with TBB.")
+ list(APPEND CERES_COMPILE_OPTIONS CERES_USE_TBB)
+ include_directories(${TBB_INCLUDE_DIRS})
+ # TBB requires C++11
+ update_cache_variable(CXX11 ON)
+ else (TBB_FOUND)
+ message("-- Failed to find TBB, disabling.")
+ update_cache_variable(TBB OFF)
+ endif (TBB_FOUND)
+endif (TBB)
+
+if (NOT OPENMP AND NOT TBB)
+ message("-- Nor OpenMP neither TBB is used, disabling multithreading.")
+ list(APPEND CERES_COMPILE_OPTIONS CERES_NO_THREADS)
+else (NOT OPENMP AND NOT TBB)
+ if (UNIX)
+ # At least on Linux, we need pthreads to be enabled for mutex to
+ # compile. This may not work on Windows or Android.
+ find_package(Threads REQUIRED)
+ list(APPEND CERES_COMPILE_OPTIONS CERES_HAVE_PTHREAD)
+ list(APPEND CERES_COMPILE_OPTIONS CERES_HAVE_RWLOCK)
+ endif (UNIX)
+endif (NOT OPENMP AND NOT TBB)
+
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++11" COMPILER_HAS_CXX11_FLAG)
if (CXX11 AND COMPILER_HAS_CXX11_FLAG)
diff --git a/cmake/FindTBB.cmake b/cmake/FindTBB.cmake
new file mode 100644
index 0000000..8d57c80
--- /dev/null
+++ b/cmake/FindTBB.cmake
@@ -0,0 +1,245 @@
+# The MIT License (MIT)
+#
+# Copyright (c) 2015 Justus Calvin
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+#
+# FindTBB
+# -------
+#
+# Find TBB include directories and libraries.
+#
+# Usage:
+#
+# find_package(TBB [major[.minor]] [EXACT]
+# [QUIET] [REQUIRED]
+# [[COMPONENTS] [components...]]
+# [OPTIONAL_COMPONENTS components...])
+#
+# where the allowed components are tbbmalloc and tbb_preview. Users may modify
+# the behavior of this module with the following variables:
+#
+# * TBB_ROOT_DIR - The base directory the of TBB installation.
+# * TBB_INCLUDE_DIR - The directory that contains the TBB headers files.
+# * TBB_LIBRARY - The directory that contains the TBB library files.
+# * TBB_<library>_LIBRARY - The path of the TBB the corresponding TBB library.
+# These libraries, if specified, override the
+# corresponding library search results, where <library>
+# may be tbb, tbb_debug, tbbmalloc, tbbmalloc_debug,
+# tbb_preview, or tbb_preview_debug.
+# * TBB_USE_DEBUG_BUILD - The debug version of tbb libraries, if present, will
+# be used instead of the release version.
+#
+# Users may modify the behavior of this module with the following environment
+# variables:
+#
+# * TBB_INSTALL_DIR
+# * TBBROOT
+# * LIBRARY_PATH
+#
+# This module will set the following variables:
+#
+# * TBB_FOUND - Set to false, or undefined, if we haven’t found, or
+# don’t want to use TBB.
+# * TBB_<component>_FOUND - If False, optional <component> part of TBB sytem is
+# not available.
+# * TBB_VERSION - The full version string
+# * TBB_VERSION_MAJOR - The major version
+# * TBB_VERSION_MINOR - The minor version
+# * TBB_INTERFACE_VERSION - The interface version number defined in
+# tbb/tbb_stddef.h.
+# * TBB_<library>_LIBRARY_RELEASE - The path of the TBB release version of
+# <library>, where <library> may be tbb, tbb_debug,
+# tbbmalloc, tbbmalloc_debug, tbb_preview, or
+# tbb_preview_debug.
+# * TBB_<library>_LIBRARY_DEGUG - The path of the TBB release version of
+# <library>, where <library> may be tbb, tbb_debug,
+# tbbmalloc, tbbmalloc_debug, tbb_preview, or
+# tbb_preview_debug.
+#
+# The following varibles should be used to build and link with TBB:
+#
+# * TBB_INCLUDE_DIRS - The include directory for TBB.
+# * TBB_LIBRARIES - The libraries to link against to use TBB.
+# * TBB_DEFINITIONS - Definitions to use when compiling code that uses TBB.
+
+include(FindPackageHandleStandardArgs)
+
+if(NOT TBB_FOUND)
+
+ ##################################
+ # Check the build type
+ ##################################
+
+ if(NOT DEFINED TBB_USE_DEBUG_BUILD)
+ if(CMAKE_BUILD_TYPE MATCHES "[Debug|DEBUG|debug|RelWithDebInfo|RELWITHDEBINFO|relwithdebinfo]")
+ set(TBB_USE_DEBUG_BUILD TRUE)
+ else()
+ set(TBB_USE_DEBUG_BUILD FALSE)
+ endif()
+ endif()
+
+ ##################################
+ # Set the TBB search directories
+ ##################################
+
+ # Define search paths based on user input and environment variables
+ set(TBB_SEARCH_DIR ${TBB_ROOT_DIR} $ENV{TBB_INSTALL_DIR} $ENV{TBBROOT})
+
+ # Define the search directories based on the current platform
+ if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
+ set(TBB_DEFAULT_SEARCH_DIR "C:/Program Files/Intel/TBB"
+ "C:/Program Files (x86)/Intel/TBB")
+
+ # Set the target architecture
+ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ set(TBB_ARCHITECTURE "intel64")
+ else()
+ set(TBB_ARCHITECTURE "ia32")
+ endif()
+
+ # Set the TBB search library path search suffix based on the version of VC
+ if(WINDOWS_STORE)
+ set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc11_ui")
+ elseif(MSVC14)
+ set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc14")
+ elseif(MSVC12)
+ set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc12")
+ elseif(MSVC11)
+ set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc11")
+ elseif(MSVC10)
+ set(TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc10")
+ endif()
+
+ # Add the library path search suffix for the VC independent version of TBB
+ list(APPEND TBB_LIB_PATH_SUFFIX "lib/${TBB_ARCHITECTURE}/vc_mt")
+
+ elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ # OS X
+ set(TBB_DEFAULT_SEARCH_DIR "/opt/intel/tbb")
+
+ # TODO: Check to see which C++ library is being used by the compiler.
+ if(NOT ${CMAKE_SYSTEM_VERSION} VERSION_LESS 13.0)
+ # The default C++ library on OS X 10.9 and later is libc++
+ set(TBB_LIB_PATH_SUFFIX "lib/libc++")
+ else()
+ set(TBB_LIB_PATH_SUFFIX "lib")
+ endif()
+ elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ # Linux
+ set(TBB_DEFAULT_SEARCH_DIR "/opt/intel/tbb")
+
+ # TODO: Check compiler version to see the suffix should be <arch>/gcc4.1 or
+ # <arch>/gcc4.1. For now, assume that the compiler is more recent than
+ # gcc 4.4.x or later.
+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ set(TBB_LIB_PATH_SUFFIX "lib/intel64/gcc4.4")
+ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$")
+ set(TBB_LIB_PATH_SUFFIX "lib/ia32/gcc4.4")
+ endif()
+ endif()
+
+ ##################################
+ # Find the TBB include dir
+ ##################################
+
+ find_path(TBB_INCLUDE_DIRS tbb/tbb.h
+ HINTS ${TBB_INCLUDE_DIR} ${TBB_SEARCH_DIR}
+ PATHS ${TBB_DEFAULT_SEARCH_DIR}
+ PATH_SUFFIXES include)
+
+ ##################################
+ # Find TBB components
+ ##################################
+
+ # Find each component
+ foreach(_comp tbb_preview tbbmalloc tbb)
+ # Search for the libraries
+ find_library(TBB_${_comp}_LIBRARY_RELEASE ${_comp}
+ HINTS ${TBB_LIBRARY} ${TBB_SEARCH_DIR}
+ PATHS ${TBB_DEFAULT_SEARCH_DIR}
+ PATH_SUFFIXES "${TBB_LIB_PATH_SUFFIX}")
+
+ find_library(TBB_${_comp}_LIBRARY_DEBUG ${_comp}_debug
+ HINTS ${TBB_LIBRARY} ${TBB_SEARCH_DIR}
+ PATHS ${TBB_DEFAULT_SEARCH_DIR} ENV LIBRARY_PATH
+ PATH_SUFFIXES "${TBB_LIB_PATH_SUFFIX}")
+
+
+ # Set the library to be used for the component
+ if(NOT TBB_${_comp}_LIBRARY)
+ if(TBB_USE_DEBUG_BUILD AND TBB_${_comp}_LIBRARY_DEBUG)
+ set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_DEBUG}")
+ elseif(TBB_${_comp}_LIBRARY_RELEASE)
+ set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_RELEASE}")
+ elseif(TBB_${_comp}_LIBRARY_DEBUG)
+ set(TBB_${_comp}_LIBRARY "${TBB_${_comp}_LIBRARY_DEBUG}")
+ endif()
+ endif()
+
+ # Set the TBB library list and component found variables
+ if(TBB_${_comp}_LIBRARY)
+ list(APPEND TBB_LIBRARIES "${TBB_${_comp}_LIBRARY}")
+ set(TBB_${_comp}_FOUND TRUE)
+ else()
+ set(TBB_${_comp}_FOUND FALSE)
+ endif()
+
+ mark_as_advanced(TBB_${_comp}_LIBRARY_RELEASE)
+ mark_as_advanced(TBB_${_comp}_LIBRARY_DEBUG)
+ mark_as_advanced(TBB_${_comp}_LIBRARY)
+
+ endforeach()
+
+ ##################################
+ # Set compile flags
+ ##################################
+
+ if(TBB_tbb_LIBRARY MATCHES "debug")
+ set(TBB_DEFINITIONS "-DTBB_USE_DEBUG=1")
+ endif()
+
+ ##################################
+ # Set version strings
+ ##################################
+
+ if(TBB_INCLUDE_DIRS)
+ file(READ "${TBB_INCLUDE_DIRS}/tbb/tbb_stddef.h" _tbb_version_file)
+ string(REGEX REPLACE ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1"
+ TBB_VERSION_MAJOR "${_tbb_version_file}")
+ string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1"
+ TBB_VERSION_MINOR "${_tbb_version_file}")
+ string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1"
+ TBB_INTERFACE_VERSION "${_tbb_version_file}")
+ set(TBB_VERSION "${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR}")
+ endif()
+
+ find_package_handle_standard_args(TBB
+ REQUIRED_VARS TBB_INCLUDE_DIRS TBB_LIBRARIES
+ HANDLE_COMPONENTS
+ VERSION_VAR TBB_VERSION)
+
+ mark_as_advanced(TBB_INCLUDE_DIRS TBB_LIBRARIES)
+
+ unset(TBB_ARCHITECTURE)
+ unset(TBB_LIB_PATH_SUFFIX)
+ unset(TBB_DEFAULT_SEARCH_DIR)
+
+endif()
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 0bf0f8f..792e315 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -67,8 +67,10 @@
@CERES_NO_THREADS@
// If defined Ceres was compiled with OpenMP multithreading support.
@CERES_USE_OPENMP@
-// Additionally defined on *nix if Ceres was compiled with OpenMP support,
-// as in this case pthreads is also required.
+// If defined Ceres was compiled with TBB multithreading support.
+@CERES_USE_TBB@
+// Additionally defined on *nix if Ceres was compiled with OpenMP or TBB
+// support, as in this case pthreads is also required.
@CERES_HAVE_PTHREAD@
@CERES_HAVE_RWLOCK@
diff --git a/docs/source/features.rst b/docs/source/features.rst
index 1f5e91e..6a4b652 100644
--- a/docs/source/features.rst
+++ b/docs/source/features.rst
@@ -54,8 +54,8 @@
of Non-linear Conjugate Gradients, BFGS and LBFGS.
* **Speed** - Ceres Solver has been extensively optimized, with C++
- templating, hand written linear algebra routines and OpenMP based
- multithreading of the Jacobian evaluation and the linear solvers.
+ templating, hand written linear algebra routines and OpenMP or TBB
+ based multithreading of the Jacobian evaluation and the linear solvers.
* **Solution Quality** Ceres is the `best performing`_ solver on the NIST
problem set used by Mondragon and Borchers for benchmarking
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 3d34ee7..6c42ef0 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -89,6 +89,10 @@
``SuiteSparse``, and optionally used by Ceres directly for some
operations.
+- `TBB <https://www.threadingbuildingblocks.org/>`_ is a C++11 template
+ library for parallel programming that optionally can be used as an alternative
+ to OpenMP. **Optional**
+
On ``UNIX`` OSes other than Mac OS X we recommend `ATLAS
<http://math-atlas.sourceforge.net/>`_, which includes ``BLAS`` and
``LAPACK`` routines. It is also possible to use `OpenBLAS
@@ -611,9 +615,12 @@
multi-threading with ``OpenMP`` is not supported. Turn this ``OFF``
to disable multi-threading.
+#. ``TBB [Default: OFF]``: An alternative to ``OpenMP`` threading library that
+ requires C++11. This option is mutually exclusive to ``OpenMP``.
+
#. ``CXX11 [Default: OFF]`` *Non-MSVC compilers only*.
- Although Ceres does not currently use C++11, it does use
+ Although Ceres does not currently require C++11, it does use
``shared_ptr`` (required) and ``unordered_map`` (if available);
both of which existed in the previous iterations of what became the
C++11 standard: TR1 & C++0x. As such, Ceres can compile on
diff --git a/docs/source/nnls_solving.rst b/docs/source/nnls_solving.rst
index fed0b3e..a699034 100644
--- a/docs/source/nnls_solving.rst
+++ b/docs/source/nnls_solving.rst
@@ -2111,8 +2111,8 @@
Number of threads actually used by the solver for Jacobian and
residual evaluation. This number is not equal to
- :member:`Solver::Summary::num_threads_given` if `OpenMP` is not
- available.
+ :member:`Solver::Summary::num_threads_given` if neither `OpenMP` nor `TBB`
+ is available.
.. member:: int Solver::Summary::num_linear_solver_threads_given
@@ -2123,8 +2123,8 @@
Number of threads actually used by the solver for solving the trust
region problem. This number is not equal to
- :member:`Solver::Summary::num_linear_solver_threads_given` if
- `OpenMP` is not available.
+ :member:`Solver::Summary::num_linear_solver_threads_given` if neither
+ `OpenMP` nor `TBB` is available.
.. member:: LinearSolverType Solver::Summary::linear_solver_type_given
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index c48472f..97db4f3 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -113,6 +113,7 @@
split.cc
stringprintf.cc
suitesparse.cc
+ thread_token_provider.cc
triplet_sparse_matrix.cc
trust_region_preprocessor.cc
trust_region_minimizer.cc
@@ -182,6 +183,13 @@
endif()
endif (OPENMP_FOUND)
+if (TBB_FOUND)
+ list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${TBB_tbb_LIBRARY})
+ if (NOT MSVC)
+ list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ${CMAKE_THREAD_LIBS_INIT})
+ endif (NOT MSVC)
+endif (TBB_FOUND)
+
set(CERES_LIBRARY_SOURCE
${CERES_INTERNAL_SRC}
${CERES_INTERNAL_HDRS}
diff --git a/internal/ceres/bundle_adjustment_test.cc b/internal/ceres/bundle_adjustment_test.cc
index 7cbf3f3..4209d6c 100644
--- a/internal/ceres/bundle_adjustment_test.cc
+++ b/internal/ceres/bundle_adjustment_test.cc
@@ -385,7 +385,7 @@
}
#endif // CERES_USE_EIGEN_SPARSE
-#ifdef CERES_USE_OPENMP
+#ifndef CERES_NO_THREADS
TEST_F(BundleAdjustmentTest, MultiThreadedDenseSchurWithAutomaticOrdering) {
RunSolverForConfigAndExpectResidualsMatch(
@@ -555,7 +555,7 @@
ThreadedSolverConfig(SPARSE_SCHUR, EIGEN_SPARSE, kUserOrdering));
}
#endif // CERES_USE_EIGEN_SPARSE
-#endif // CERES_USE_OPENMP
+#endif // !CERES_NO_THREADS
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/coordinate_descent_minimizer.cc b/internal/ceres/coordinate_descent_minimizer.cc
index 83e0bd5..77e3bbf 100644
--- a/internal/ceres/coordinate_descent_minimizer.cc
+++ b/internal/ceres/coordinate_descent_minimizer.cc
@@ -30,8 +30,9 @@
#include "ceres/coordinate_descent_minimizer.h"
-#ifdef CERES_USE_OPENMP
-#include <omp.h>
+#ifdef CERES_USE_TBB
+#include <tbb/parallel_for.h>
+#include <tbb/task_scheduler_init.h>
#endif
#include <iterator>
@@ -48,6 +49,8 @@
#include "ceres/solver.h"
#include "ceres/trust_region_minimizer.h"
#include "ceres/trust_region_strategy.h"
+#include "ceres/thread_token_provider.h"
+#include "ceres/scoped_thread_token.h"
namespace ceres {
namespace internal {
@@ -148,30 +151,39 @@
for (int i = 0; i < independent_set_offsets_.size() - 1; ++i) {
const int num_problems =
independent_set_offsets_[i + 1] - independent_set_offsets_[i];
- // No point paying the price for an OpemMP call if the set is of
- // size zero.
+ // Avoid parallelization overhead call if the set is empty.
if (num_problems == 0) {
continue;
}
-#ifdef CERES_USE_OPENMP
+
const int num_inner_iteration_threads =
min(options.num_threads, num_problems);
evaluator_options_.num_threads =
max(1, options.num_threads / num_inner_iteration_threads);
+ ThreadTokenProvider thread_token_provider(num_inner_iteration_threads);
+
+#ifdef CERES_USE_OPENMP
// The parameter blocks in each independent set can be optimized
// in parallel, since they do not co-occur in any residual block.
#pragma omp parallel for num_threads(num_inner_iteration_threads)
#endif
+
+#ifndef CERES_USE_TBB
for (int j = independent_set_offsets_[i];
j < independent_set_offsets_[i + 1];
++j) {
-#ifdef CERES_USE_OPENMP
- const int thread_id = omp_get_thread_num();
#else
- const int thread_id = 0;
-#endif
+ tbb::task_scheduler_init tbb_task_scheduler_init(
+ num_inner_iteration_threads);
+ tbb::parallel_for(independent_set_offsets_[i],
+ independent_set_offsets_[i + 1],
+ [&](int j) {
+#endif // !CERES_USE_TBB
+
+ const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+ const int thread_id = scoped_thread_token.token();
ParameterBlock* parameter_block = parameter_blocks_[j];
const int old_index = parameter_block->index();
@@ -203,6 +215,9 @@
parameter_block->SetState(parameters + parameter_block->state_offset());
parameter_block->SetConstant();
}
+#ifdef CERES_USE_TBB
+ );
+#endif
}
for (int i = 0; i < parameter_blocks_.size(); ++i) {
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index c16e477..81b3ba1 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -30,8 +30,9 @@
#include "ceres/covariance_impl.h"
-#ifdef CERES_USE_OPENMP
-#include <omp.h>
+#ifdef CERES_USE_TBB
+#include <tbb/parallel_for.h>
+#include <tbb/task_scheduler_init.h>
#endif
#include <algorithm>
@@ -55,7 +56,9 @@
#include "ceres/parameter_block.h"
#include "ceres/problem_impl.h"
#include "ceres/residual_block.h"
+#include "ceres/scoped_thread_token.h"
#include "ceres/suitesparse.h"
+#include "ceres/thread_token_provider.h"
#include "ceres/wall_time.h"
#include "glog/logging.h"
@@ -75,10 +78,10 @@
: options_(options),
is_computed_(false),
is_valid_(false) {
-#ifndef CERES_USE_OPENMP
+#ifdef CERES_NO_THREADS
if (options_.num_threads > 1) {
LOG(WARNING)
- << "OpenMP support is not compiled into this binary; "
+ << "Neither OpenMP nor TBB support is compiled into this binary; "
<< "only options.num_threads = 1 is supported. Switching "
<< "to single threaded mode.";
options_.num_threads = 1;
@@ -339,49 +342,68 @@
bool success = true;
+ ThreadTokenProvider thread_token_provider(num_threads);
+
+#ifdef CERES_USE_OPENMP
// The collapse() directive is only supported in OpenMP 3.0 and higher. OpenMP
// 3.0 was released in May 2008 (hence the version number).
-#if _OPENMP >= 200805
-# pragma omp parallel for num_threads(num_threads) schedule(dynamic) collapse(2)
-#else
-# pragma omp parallel for num_threads(num_threads) schedule(dynamic)
-#endif
+# if _OPENMP >= 200805
+# pragma omp parallel for num_threads(num_threads) schedule(dynamic) collapse(2)
+# else
+# pragma omp parallel for num_threads(num_threads) schedule(dynamic)
+# endif
for (int i = 0; i < num_parameters; ++i) {
for (int j = 0; j < num_parameters; ++j) {
// The second loop can't start from j = i for compatibility with OpenMP
// collapse command. The conditional serves as a workaround
- if (j >= i) {
- int covariance_row_idx = cum_parameter_size[i];
- int covariance_col_idx = cum_parameter_size[j];
- int size_i = parameter_sizes[i];
- int size_j = parameter_sizes[j];
-#ifdef CERES_USE_OPENMP
- const int thread_id = omp_get_thread_num();
-#else
- const int thread_id = 0;
-#endif
- double* covariance_block =
- workspace.get() +
- thread_id * max_covariance_block_size * max_covariance_block_size;
- if (!GetCovarianceBlockInTangentOrAmbientSpace(
- parameters[i], parameters[j], lift_covariance_to_ambient_space,
- covariance_block)) {
- success = false;
- }
+ if (j < i) {
+ continue;
+ }
+#endif // CERES_USE_OPENMP
- covariance.block(covariance_row_idx, covariance_col_idx,
- size_i, size_j) =
- MatrixRef(covariance_block, size_i, size_j);
+#ifdef CERES_NO_THREADS
+ for (int i = 0; i < num_parameters; ++i) {
+ for (int j = i; j < num_parameters; ++j) {
+#endif // CERES_NO_THREADS
- if (i != j) {
- covariance.block(covariance_col_idx, covariance_row_idx,
- size_j, size_i) =
- MatrixRef(covariance_block, size_i, size_j).transpose();
+#ifdef CERES_USE_TBB
+ tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
+ tbb::parallel_for(0, num_parameters, [&](int i) {
+ tbb::parallel_for(i, num_parameters, [&](int j) {
+#endif // CERES_USE_TBB
- }
+ int covariance_row_idx = cum_parameter_size[i];
+ int covariance_col_idx = cum_parameter_size[j];
+ int size_i = parameter_sizes[i];
+ int size_j = parameter_sizes[j];
+ const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+ const int thread_id = scoped_thread_token.token();
+ double* covariance_block =
+ workspace.get() +
+ thread_id * max_covariance_block_size * max_covariance_block_size;
+ if (!GetCovarianceBlockInTangentOrAmbientSpace(
+ parameters[i], parameters[j], lift_covariance_to_ambient_space,
+ covariance_block)) {
+ success = false;
+ }
+
+ covariance.block(covariance_row_idx, covariance_col_idx,
+ size_i, size_j) =
+ MatrixRef(covariance_block, size_i, size_j);
+
+ if (i != j) {
+ covariance.block(covariance_col_idx, covariance_row_idx,
+ size_j, size_i) =
+ MatrixRef(covariance_block, size_i, size_j).transpose();
+
}
}
+#ifdef CERES_USE_TBB
+ );
+ });
+#else
}
+#endif // CERES_USE_TBB
return success;
}
@@ -696,33 +718,42 @@
const int num_threads = options_.num_threads;
scoped_array<double> workspace(new double[num_threads * num_cols]);
+ ThreadTokenProvider thread_token_provider(num_threads);
+
+#ifdef CERES_USE_OPENMP
#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
for (int r = 0; r < num_cols; ++r) {
+#else
+ tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
+ tbb::parallel_for(0, num_cols, [&](int r) {
+#endif // !CERES_USE_TBB
+
const int row_begin = rows[r];
const int row_end = rows[r + 1];
- if (row_end == row_begin) {
- continue;
- }
+ if (row_end != row_begin) {
+ const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+ const int thread_id = scoped_thread_token.token();
-# ifdef CERES_USE_OPENMP
- const int thread_id = omp_get_thread_num();
-# else
- const int thread_id = 0;
-# endif
-
- double* solution = workspace.get() + thread_id * num_cols;
- SolveRTRWithSparseRHS<SuiteSparse_long>(
- num_cols,
- static_cast<SuiteSparse_long*>(R->i),
- static_cast<SuiteSparse_long*>(R->p),
- static_cast<double*>(R->x),
- inverse_permutation[r],
- solution);
- for (int idx = row_begin; idx < row_end; ++idx) {
- const int c = cols[idx];
- values[idx] = solution[inverse_permutation[c]];
+ double* solution = workspace.get() + thread_id * num_cols;
+ SolveRTRWithSparseRHS<SuiteSparse_long>(
+ num_cols,
+ static_cast<SuiteSparse_long*>(R->i),
+ static_cast<SuiteSparse_long*>(R->p),
+ static_cast<double*>(R->x),
+ inverse_permutation[r],
+ solution);
+ for (int idx = row_begin; idx < row_end; ++idx) {
+ const int c = cols[idx];
+ values[idx] = solution[inverse_permutation[c]];
+ }
}
}
+#ifdef CERES_USE_TBB
+ );
+#endif // CERES_USE_TBB
free(permutation);
cholmod_l_free_sparse(&R, &cc);
@@ -888,37 +919,47 @@
const int num_threads = options_.num_threads;
scoped_array<double> workspace(new double[num_threads * num_cols]);
+ ThreadTokenProvider thread_token_provider(num_threads);
+
+#ifdef CERES_USE_OPENMP
#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
for (int r = 0; r < num_cols; ++r) {
+#else
+ tbb::task_scheduler_init tbb_task_scheduler_init(num_threads);
+ tbb::parallel_for(0, num_cols, [&](int r) {
+#endif // !CERES_USE_TBB
+
const int row_begin = rows[r];
const int row_end = rows[r + 1];
- if (row_end == row_begin) {
- continue;
- }
+ if (row_end != row_begin) {
+ const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+ const int thread_id = scoped_thread_token.token();
-# ifdef CERES_USE_OPENMP
- const int thread_id = omp_get_thread_num();
-# else
- const int thread_id = 0;
-# endif
+ double* solution = workspace.get() + thread_id * num_cols;
+ SolveRTRWithSparseRHS<int>(
+ num_cols,
+ qr_solver.matrixR().innerIndexPtr(),
+ qr_solver.matrixR().outerIndexPtr(),
+ &qr_solver.matrixR().data().value(0),
+ inverse_permutation.indices().coeff(r),
+ solution);
- double* solution = workspace.get() + thread_id * num_cols;
- SolveRTRWithSparseRHS<int>(
- num_cols,
- qr_solver.matrixR().innerIndexPtr(),
- qr_solver.matrixR().outerIndexPtr(),
- &qr_solver.matrixR().data().value(0),
- inverse_permutation.indices().coeff(r),
- solution);
-
- // Assign the values of the computed covariance using the
- // inverse permutation used in the QR factorization.
- for (int idx = row_begin; idx < row_end; ++idx) {
- const int c = cols[idx];
- values[idx] = solution[inverse_permutation.indices().coeff(c)];
+ // Assign the values of the computed covariance using the
+ // inverse permutation used in the QR factorization.
+ for (int idx = row_begin; idx < row_end; ++idx) {
+ const int c = cols[idx];
+ values[idx] = solution[inverse_permutation.indices().coeff(c)];
+ }
}
}
+#ifdef CERES_USE_TBB
+ );
+#endif // CERES_USE_TBB
+
event_logger.AddEvent("Inverse");
return true;
diff --git a/internal/ceres/preprocessor.cc b/internal/ceres/preprocessor.cc
index 4aba6a3..e9dc293 100644
--- a/internal/ceres/preprocessor.cc
+++ b/internal/ceres/preprocessor.cc
@@ -56,10 +56,10 @@
}
void ChangeNumThreadsIfNeeded(Solver::Options* options) {
-#ifndef CERES_USE_OPENMP
+#ifdef CERES_NO_THREADS
if (options->num_threads > 1) {
LOG(WARNING)
- << "OpenMP support is not compiled into this binary; "
+ << "Neither OpenMP nor TBB support is compiled into this binary; "
<< "only options.num_threads = 1 is supported. Switching "
<< "to single threaded mode.";
options->num_threads = 1;
@@ -69,12 +69,12 @@
if (options->minimizer_type == TRUST_REGION &&
options->num_linear_solver_threads > 1) {
LOG(WARNING)
- << "OpenMP support is not compiled into this binary; "
+ << "Neither OpenMP nor TBB support is compiled into this binary; "
<< "only options.num_linear_solver_threads=1 is supported. Switching "
<< "to single threaded mode.";
options->num_linear_solver_threads = 1;
}
-#endif // CERES_USE_OPENMP
+#endif // CERES_NO_THREADS
}
void SetupCommonMinimizerOptions(PreprocessedProblem* pp) {
diff --git a/internal/ceres/problem_impl.cc b/internal/ceres/problem_impl.cc
index 9861958..4088079 100644
--- a/internal/ceres/problem_impl.cc
+++ b/internal/ceres/problem_impl.cc
@@ -746,15 +746,15 @@
// the Evaluator decides the storage for the Jacobian based on the
// type of linear solver being used.
evaluator_options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
-#ifndef CERES_USE_OPENMP
+#ifdef CERES_NO_THREADS
LOG_IF(WARNING, evaluate_options.num_threads > 1)
- << "OpenMP support is not compiled into this binary; "
+ << "Neither OpenMP nor TBB support is compiled into this binary; "
<< "only evaluate_options.num_threads = 1 is supported. Switching "
<< "to single threaded mode.";
evaluator_options.num_threads = 1;
#else
evaluator_options.num_threads = evaluate_options.num_threads;
-#endif // CERES_USE_OPENMP
+#endif // CERES_NO_THREADS
scoped_ptr<Evaluator> evaluator(
new ProgramEvaluator<ScratchEvaluatePreparer,
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h
index 597ff18..6049d94 100644
--- a/internal/ceres/program_evaluator.h
+++ b/internal/ceres/program_evaluator.h
@@ -43,7 +43,7 @@
// residual jacobians are written directly into their final position in the
// block sparse matrix by the user's CostFunction; there is no copying.
//
-// The evaluation is threaded with OpenMP.
+// The evaluation is threaded with OpenMP or TBB.
//
// The EvaluatePreparer and JacobianWriter interfaces are as follows:
//
@@ -82,10 +82,6 @@
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/port.h"
-#ifdef CERES_USE_OPENMP
-#include <omp.h>
-#endif
-
#include <map>
#include <string>
#include <vector>
@@ -95,7 +91,16 @@
#include "ceres/parameter_block.h"
#include "ceres/program.h"
#include "ceres/residual_block.h"
+#include "ceres/scoped_thread_token.h"
#include "ceres/small_blas.h"
+#include "ceres/thread_token_provider.h"
+
+#ifdef CERES_USE_TBB
+#include <atomic>
+
+#include <tbb/parallel_for.h>
+#include <tbb/task_scheduler_init.h>
+#endif
namespace ceres {
namespace internal {
@@ -115,15 +120,15 @@
jacobian_writer_(options, program),
evaluate_preparers_(
jacobian_writer_.CreateEvaluatePreparers(options.num_threads)) {
-#ifndef CERES_USE_OPENMP
+#ifdef CERES_NO_THREADS
if (options_.num_threads > 1) {
LOG(WARNING)
- << "OpenMP support is not compiled into this binary; "
+ << "Neither OpenMP nor TBB support is compiled into this binary; "
<< "only options.num_threads = 1 is supported. Switching "
<< "to single threaded mode.";
options_.num_threads = 1;
}
-#endif
+#endif // CERES_NO_THREADS
BuildResidualLayout(*program, &residual_layout_);
evaluate_scratch_.reset(CreateEvaluatorScratch(*program,
@@ -169,24 +174,43 @@
}
}
+ const int num_residual_blocks = program_->NumResidualBlocks();
+
+ ThreadTokenProvider thread_token_provider(options_.num_threads);
+
+#ifdef CERES_USE_OPENMP
// This bool is used to disable the loop if an error is encountered
// without breaking out of it. The remaining loop iterations are still run,
// but with an empty body, and so will finish quickly.
bool abort = false;
- int num_residual_blocks = program_->NumResidualBlocks();
#pragma omp parallel for num_threads(options_.num_threads)
for (int i = 0; i < num_residual_blocks; ++i) {
// Disable the loop instead of breaking, as required by OpenMP.
#pragma omp flush(abort)
+#endif // CERES_USE_OPENMP
+
+#ifdef CERES_NO_THREADS
+ bool abort = false;
+ for (int i = 0; i < num_residual_blocks; ++i) {
+#endif // CERES_NO_THREADS
+
+#ifdef CERES_USE_TBB
+ std::atomic_bool abort(false);
+ tbb::task_scheduler_init tbb_task_scheduler_init(options_.num_threads);
+ tbb::parallel_for(0, num_residual_blocks, [&](int i) {
+#endif // CERES_USE_TBB
+
if (abort) {
+#ifndef CERES_USE_TBB
continue;
+#else
+ return;
+#endif // !CERES_USE_TBB
}
-#ifdef CERES_USE_OPENMP
- const int thread_id = omp_get_thread_num();
-#else
- const int thread_id = 0;
-#endif
+ const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+ const int thread_id = scoped_thread_token.token();
+
EvaluatePreparer* preparer = &evaluate_preparers_[thread_id];
EvaluateScratch* scratch = &evaluate_scratch_[thread_id];
@@ -218,11 +242,18 @@
block_jacobians,
scratch->residual_block_evaluate_scratch.get())) {
abort = true;
+#ifdef CERES_USE_OPENMP
// This ensures that the OpenMP threads have a consistent view of 'abort'. Do
// the flush inside the failure case so that there is usually only one
// synchronization point per loop iteration instead of two.
#pragma omp flush(abort)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
continue;
+#else
+ return;
+#endif // !CERES_USE_TBB
}
scratch->cost += block_cost;
@@ -255,6 +286,9 @@
}
}
}
+#ifdef CERES_USE_TBB
+ );
+#endif // CERES_USE_TBB
if (!abort) {
const int num_parameters = program_->NumEffectiveParameters();
diff --git a/internal/ceres/schur_eliminator.h b/internal/ceres/schur_eliminator.h
index a546228..667f384 100644
--- a/internal/ceres/schur_eliminator.h
+++ b/internal/ceres/schur_eliminator.h
@@ -296,7 +296,8 @@
const double* inverse_ete_g,
double* rhs);
- void ChunkOuterProduct(const CompressedRowBlockStructure* bs,
+ void ChunkOuterProduct(int thread_id,
+ const CompressedRowBlockStructure* bs,
const Matrix& inverse_eet,
const double* buffer,
const BufferLayoutType& buffer_layout,
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index 5091b93..01409fd 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -48,10 +48,6 @@
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/port.h"
-#ifdef CERES_USE_OPENMP
-#include <omp.h>
-#endif
-
#include <algorithm>
#include <map>
#include "ceres/block_random_access_matrix.h"
@@ -63,11 +59,18 @@
#include "ceres/invert_psd_matrix.h"
#include "ceres/map_util.h"
#include "ceres/schur_eliminator.h"
+#include "ceres/scoped_thread_token.h"
#include "ceres/small_blas.h"
#include "ceres/stl_util.h"
+#include "ceres/thread_token_provider.h"
#include "Eigen/Dense"
#include "glog/logging.h"
+#ifdef CERES_USE_TBB
+#include <tbb/parallel_for.h>
+#include <tbb/task_scheduler_init.h>
+#endif
+
namespace ceres {
namespace internal {
@@ -188,8 +191,17 @@
// Add the diagonal to the schur complement.
if (D != NULL) {
+#ifdef CERES_USE_OPENMP
#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
+#else
+ tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
+ tbb::parallel_for(num_eliminate_blocks_, num_col_blocks, [&](int i) {
+#endif // !CERES_USE_TBB
+
const int block_id = i - num_eliminate_blocks_;
int r, c, row_stride, col_stride;
CellInfo* cell_info = lhs->GetCell(block_id, block_id,
@@ -206,8 +218,14 @@
+= diag.array().square().matrix();
}
}
+#ifdef CERES_USE_TBB
+ );
+#endif // CERES_USE_TBB
}
+ ThreadTokenProvider thread_token_provider(num_threads_);
+
+#ifdef CERES_USE_OPENMP
// Eliminate y blocks one chunk at a time. For each chunk, compute
// the entries of the normal equations and the gradient vector block
// corresponding to the y block and then apply Gaussian elimination
@@ -222,12 +240,18 @@
// block. EliminateRowOuterProduct does the corresponding operation
// for the lhs of the reduced linear system.
#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
for (int i = 0; i < chunks_.size(); ++i) {
-#ifdef CERES_USE_OPENMP
- const int thread_id = omp_get_thread_num();
#else
- const int thread_id = 0;
-#endif
+ tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
+ tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
+#endif // !CERES_USE_TBB
+
+ const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+ const int thread_id = scoped_thread_token.token();
+
double* buffer = buffer_.get() + thread_id * buffer_size_;
const Chunk& chunk = chunks_[i];
const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
@@ -289,8 +313,12 @@
UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);
// S -= F'E(E'E)^{-1}E'F
- ChunkOuterProduct(bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
+ ChunkOuterProduct(
+ thread_id, bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
}
+#ifdef CERES_USE_TBB
+ );
+#endif // CERES_USE_TBB
// For rows with no e_blocks, the schur complement update reduces to
// S += F'F.
@@ -306,8 +334,18 @@
const double* z,
double* y) {
const CompressedRowBlockStructure* bs = A->block_structure();
+
+#ifdef CERES_USE_OPENMP
#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
+#endif // CERES_USE_OPENMP
+
+#ifndef CERES_USE_TBB
for (int i = 0; i < chunks_.size(); ++i) {
+#else
+ tbb::task_scheduler_init tbb_task_scheduler_init(num_threads_);
+ tbb::parallel_for(0, int(chunks_.size()), [&](int i) {
+#endif // !CERES_USE_TBB
+
const Chunk& chunk = chunks_[i];
const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
const int e_block_size = bs->cols[e_block_id].size;
@@ -363,6 +401,9 @@
y_block = InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete)
* y_block;
}
+#ifdef CERES_USE_TBB
+ );
+#endif // CERES_USE_TBB
}
// Update the rhs of the reduced linear system. Compute
@@ -496,7 +537,8 @@
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void
SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
-ChunkOuterProduct(const CompressedRowBlockStructure* bs,
+ChunkOuterProduct(int thread_id,
+ const CompressedRowBlockStructure* bs,
const Matrix& inverse_ete,
const double* buffer,
const BufferLayoutType& buffer_layout,
@@ -508,11 +550,6 @@
const int e_block_size = inverse_ete.rows();
BufferLayoutType::const_iterator it1 = buffer_layout.begin();
-#ifdef CERES_USE_OPENMP
- const int thread_id = omp_get_thread_num();
-#else
- const int thread_id = 0;
-#endif
double* b1_transpose_inverse_ete =
chunk_outer_product_buffer_.get() + thread_id * buffer_size_;
diff --git a/internal/ceres/scoped_thread_token.h b/internal/ceres/scoped_thread_token.h
new file mode 100644
index 0000000..c167397
--- /dev/null
+++ b/internal/ceres/scoped_thread_token.h
@@ -0,0 +1,61 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2017 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: yp@photonscore.de (Yury Prokazov)
+
+#ifndef CERES_INTERNAL_SCOPED_THREAD_TOKEN_H_
+#define CERES_INTERNAL_SCOPED_THREAD_TOKEN_H_
+
+#include "ceres/thread_token_provider.h"
+
+namespace ceres {
+namespace internal {
+
+// Helper class for ThreadTokenProvider. This object acquires a token in its
+// constructor and puts that token back with destruction.
+class ScopedThreadToken {
+ public:
+ ScopedThreadToken(ThreadTokenProvider* provider)
+ : provider_(provider), token_(provider->Acquire()) {}
+
+ ~ScopedThreadToken() { provider_->Release(token_); }
+
+ int token() const { return token_; }
+
+ private:
+ ThreadTokenProvider* provider_;
+ int token_;
+
+ ScopedThreadToken(ScopedThreadToken&);
+ ScopedThreadToken& operator=(ScopedThreadToken&);
+};
+
+} // namespace internal
+} // namespace ceres
+
+#endif // CERES_INTERNAL_SCOPED_THREAD_TOKEN_H_
diff --git a/internal/ceres/thread_token_provider.cc b/internal/ceres/thread_token_provider.cc
new file mode 100644
index 0000000..5648d4d
--- /dev/null
+++ b/internal/ceres/thread_token_provider.cc
@@ -0,0 +1,74 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2017 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: yp@photonscore.de (Yury Prokazov)
+
+#include "ceres/thread_token_provider.h"
+
+#ifdef CERES_USE_OPENMP
+#include <omp.h>
+#endif
+
+namespace ceres {
+namespace internal {
+
+ThreadTokenProvider::ThreadTokenProvider(int num_threads) {
+ (void)num_threads;
+#ifdef CERES_USE_TBB
+ pool_.set_capacity(num_threads);
+ for (int i = 0; i < num_threads; i++) {
+ pool_.push(i);
+ }
+#endif
+}
+
+int ThreadTokenProvider::Acquire() {
+#ifdef CERES_USE_OPENMP
+ return omp_get_thread_num();
+#endif
+
+#ifdef CERES_NO_THREADS
+ return 0;
+#endif
+
+#ifdef CERES_USE_TBB
+ int thread_id;
+ pool_.pop(thread_id);
+ return thread_id;
+#endif
+}
+
+void ThreadTokenProvider::Release(int thread_id) {
+ (void)thread_id;
+#ifdef CERES_USE_TBB
+ pool_.push(thread_id);
+#endif
+}
+
+} // namespace internal
+} // namespace ceres
diff --git a/internal/ceres/thread_token_provider.h b/internal/ceres/thread_token_provider.h
new file mode 100644
index 0000000..209445d
--- /dev/null
+++ b/internal/ceres/thread_token_provider.h
@@ -0,0 +1,112 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2017 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: yp@photonscore.de (Yury Prokazov)
+
+#ifndef CERES_INTERNAL_THREAD_TOKEN_PROVIDER_H_
+#define CERES_INTERNAL_THREAD_TOKEN_PROVIDER_H_
+
+#include "ceres/internal/config.h"
+
+#if defined(CERES_USE_OPENMP)
+# if defined(CERES_USE_TBB) || defined(CERES_NO_THREADS)
+# error CERES_USE_OPENMP is mutually exclusive to CERES_USE_TBB and CERES_NO_THREADS
+# endif
+#elif defined(CERES_USE_TBB)
+# if defined(CERES_USE_OPENMP) || defined(CERES_NO_THREADS)
+# error CERES_USE_TBB is mutually exclusive to CERES_USE_OPENMP and CERES_NO_THREADS
+# endif
+#elif defined(CERES_NO_THREADS)
+# if defined(CERES_USE_OPENMP) || defined(CERES_USE_TBB)
+# error CERES_NO_THREADS is mutually exclusive to CERES_USE_OPENMP and CERES_USE_TBB
+# endif
+#else
+# error One of CERES_USE_OPENMP, CERES_USE_TBB or CERES_NO_THREADS must be defined.
+#endif
+
+#ifdef CERES_USE_TBB
+#include <tbb/concurrent_queue.h>
+#endif
+
+namespace ceres {
+namespace internal {
+
+// Helper for TBB thread number identification that is similar to
+// omp_get_thread_num() behaviour. This is necessary to support TBB threading
+// with a sequential thread id. This is used to access preallocated resources in
+// the parallelized code parts. The sequence of tokens varies from 0 to
+// num_threads - 1 that can be acquired to identify the thread in a thread pool.
+//
+// If CERES_NO_THREADS is defined, Acquire() always returns 0 and Release()
+// takes no action.
+//
+// If CERES_USE_OPENMP, omp_get_thread_num() is used to Acquire() with no action
+// in Release()
+//
+//
+// Example usage pseudocode:
+//
+// ThreadTokenProvider ttp(N); // allocate N tokens
+// Spawn N threads {
+// int token = ttp.Acquire(); // get unique token
+// ...
+// ... use token to access resources bound to the thread
+// ...
+// ttp.Release(token); // return token to the pool
+// }
+//
+class ThreadTokenProvider {
+ public:
+ ThreadTokenProvider(int num_threads);
+
+ // Returns the first token from the queue. The acquired value must be
+ // given back by Release().
+ //
+ // With TBB this function will block if all the tokens are aquired by
+ // concurent threads.
+ int Acquire();
+
+ // Makes previously acquired token available for other threads.
+ void Release(int thread_id);
+
+ private:
+#ifdef CERES_USE_TBB
+ // This queue initially holds a sequence from 0..num_threads-1. Every
+ // Acquire() call the first number is removed from here. When the token is not
+ // needed anymore it shall be given back with corresponding Release() call.
+ tbb::concurrent_bounded_queue<int> pool_;
+#endif
+
+ ThreadTokenProvider(ThreadTokenProvider&);
+ ThreadTokenProvider& operator=(ThreadTokenProvider&);
+};
+
+} // namespace internal
+} // namespace ceres
+
+#endif // CERES_INTERNAL_THREAD_TOKEN_PROVIDER_H_