Reorganize ParallelFor source files

Split the monolithic parallel_for_cxx.{h,cc} into focused units:
parallel_for.{h,cc} keeps the ParallelFor interface,
parallel_invoke.{h,cc} holds the thread-pool invocation machinery,
partition_range_for_parallel_for.h holds the cost-based range
partitioning, and parallel_vector_ops.h holds the parallel vector
helpers (ParallelAssign, ParallelSetZero). Also remove the remaining
OpenMP / CERES_THREADING_MODEL references from the docs and build
files, since modern C++ threads are the only threading backend.

Change-Id: Ic4941919e59210b48e447cbb61e539200c8c89df
diff --git a/bazel/ceres.bzl b/bazel/ceres.bzl
index a1117aa..dc92e0b 100644
--- a/bazel/ceres.bzl
+++ b/bazel/ceres.bzl
@@ -95,7 +95,8 @@
"manifold.cc",
"minimizer.cc",
"normal_prior.cc",
- "parallel_for_cxx.cc",
+ "parallel_for.cc",
+ "parallel_invoke.cc",
"parallel_utils.cc",
"parameter_block_ordering.cc",
"partitioned_matrix_view.cc",
@@ -205,7 +206,6 @@
"CERES_NO_EXPORT=",
"CERES_NO_LAPACK",
"CERES_NO_SUITESPARSE",
- "CERES_USE_CXX_THREADS",
"CERES_USE_EIGEN_SPARSE",
],
includes = [
diff --git a/docs/source/features.rst b/docs/source/features.rst
index d905415..ba22b9e 100644
--- a/docs/source/features.rst
+++ b/docs/source/features.rst
@@ -47,7 +47,7 @@
linear system. To this end Ceres ships with a variety of linear
solvers - dense QR and dense Cholesky factorization (using
`Eigen`_, `LAPACK`_ or `CUDA`_) for dense problems, sparse
- Cholesky factorization (`SuiteSparse`_, `Accelerate`_, `Eigen`_)
+ Cholesky factorization (`SuiteSparse`_, `Apple's Accelerate`_, `Eigen`_)
for large sparse problems, custom Schur complement based dense,
sparse, and iterative linear solvers for `bundle adjustment`_
problems.
@@ -59,9 +59,8 @@
of Non-linear Conjugate Gradients, BFGS and LBFGS.
* **Speed** - Ceres Solver has been extensively optimized, with C++
- templating, hand written linear algebra routines and OpenMP or
- modern C++ threads based multithreading of the Jacobian evaluation
- and the linear solvers.
+ templating, hand written linear algebra routines and modern C++ threads
+ based multithreading of the Jacobian evaluation and the linear solvers.
* **GPU Acceleration** If your system supports `CUDA`_ then Ceres
Solver can use the Nvidia GPU on your system to speed up the solver.
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index d9e338f..3838ccb 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -663,11 +663,6 @@
gains in the ``SPARSE_SCHUR`` solver, you can disable some of the
template specializations by turning this ``OFF``.
-#. ``CERES_THREADING_MODEL [Default: CXX_THREADS > OPENMP > NO_THREADS]``:
- Multi-threading backend Ceres should be compiled with. This will
- automatically be set to only accept the available subset of threading
- options in the CMake GUI.
-
#. ``BUILD_SHARED_LIBS [Default: OFF]``: By default Ceres is built as
a static library, turn this ``ON`` to instead build Ceres as a
shared library.
@@ -860,11 +855,6 @@
#. ``SchurSpecializations``: Ceres built with Schur specializations
(``SCHUR_SPECIALIZATIONS=ON``).
-#. ``OpenMP``: Ceres built with OpenMP (``CERES_THREADING_MODEL=OPENMP``).
-
-#. ``Multithreading``: Ceres built with *a* multithreading library.
- This is equivalent to (``CERES_THREAD != NO_THREADS``).
-
To specify one/multiple Ceres components use the ``COMPONENTS`` argument to
`find_package()
<http://www.cmake.org/cmake/help/v3.5/command/find_package.html>`_ like so:
diff --git a/docs/source/nnls_solving.rst b/docs/source/nnls_solving.rst
index e281b28..7f7340b 100644
--- a/docs/source/nnls_solving.rst
+++ b/docs/source/nnls_solving.rst
@@ -2298,9 +2298,7 @@
.. member:: int Solver::Summary::num_threads_used
Number of threads actually used by the solver for Jacobian and
- residual evaluation. This number is not equal to
- :member:`Solver::Summary::num_threads_given` if none of `OpenMP`
- or `CXX_THREADS` is available.
+ residual evaluation.
.. member:: LinearSolverType Solver::Summary::linear_solver_type_given
diff --git a/include/ceres/solver.h b/include/ceres/solver.h
index 09c6525..52249c5 100644
--- a/include/ceres/solver.h
+++ b/include/ceres/solver.h
@@ -991,8 +991,7 @@
int num_threads_given = -1;
// Number of threads actually used by the solver for Jacobian and
- // residual evaluation. This number is not equal to
- // num_threads_given if OpenMP is not available.
+ // residual evaluation.
int num_threads_used = -1;
// Type of the linear solver requested by the user.
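For context, the two counters surface through the public API like so; a brief
sketch using the standard hello-world problem (illustrative, not part of this
patch):

    #include <cstdio>

    #include "ceres/ceres.h"

    struct QuadraticCost {
      template <typename T>
      bool operator()(const T* const x, T* residual) const {
        residual[0] = T(10.0) - x[0];
        return true;
      }
    };

    int main() {
      double x = 0.5;
      ceres::Problem problem;
      problem.AddResidualBlock(
          new ceres::AutoDiffCostFunction<QuadraticCost, 1, 1>(
              new QuadraticCost),
          nullptr, &x);

      ceres::Solver::Options options;
      options.num_threads = 8;  // Threads requested for Jacobian evaluation.
      ceres::Solver::Summary summary;
      ceres::Solve(options, &problem, &summary);
      // Compare the requested and the actually used thread counts.
      std::printf("given=%d used=%d\n",
                  summary.num_threads_given, summary.num_threads_used);
      return 0;
    }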
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 23dbbd6..9729ef6 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -219,7 +219,8 @@
linear_solver.cc
low_rank_inverse_hessian.cc
minimizer.cc
- parallel_for_cxx.cc
+ parallel_for.cc
+ parallel_invoke.cc
parallel_utils.cc
parameter_block_ordering.cc
partitioned_matrix_view.cc
diff --git a/internal/ceres/block_random_access_dense_matrix.cc b/internal/ceres/block_random_access_dense_matrix.cc
index deb0a80..30d73f5 100644
--- a/internal/ceres/block_random_access_dense_matrix.cc
+++ b/internal/ceres/block_random_access_dense_matrix.cc
@@ -34,7 +34,7 @@
#include <vector>
#include "ceres/internal/eigen.h"
-#include "ceres/parallel_for.h"
+#include "ceres/parallel_vector_ops.h"
#include "glog/logging.h"
namespace ceres::internal {
diff --git a/internal/ceres/block_random_access_diagonal_matrix.cc b/internal/ceres/block_random_access_diagonal_matrix.cc
index fd109ba..daabf8b 100644
--- a/internal/ceres/block_random_access_diagonal_matrix.cc
+++ b/internal/ceres/block_random_access_diagonal_matrix.cc
@@ -40,6 +40,7 @@
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/internal/export.h"
#include "ceres/parallel_for.h"
+#include "ceres/parallel_vector_ops.h"
#include "ceres/stl_util.h"
#include "ceres/types.h"
#include "glog/logging.h"
diff --git a/internal/ceres/block_random_access_sparse_matrix.cc b/internal/ceres/block_random_access_sparse_matrix.cc
index 36a3188..bd9f1ba 100644
--- a/internal/ceres/block_random_access_sparse_matrix.cc
+++ b/internal/ceres/block_random_access_sparse_matrix.cc
@@ -37,7 +37,7 @@
#include <vector>
#include "ceres/internal/export.h"
-#include "ceres/parallel_for.h"
+#include "ceres/parallel_vector_ops.h"
#include "ceres/triplet_sparse_matrix.h"
#include "ceres/types.h"
#include "glog/logging.h"
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index f49f229..a60ee89 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -40,6 +40,7 @@
#include "ceres/crs_matrix.h"
#include "ceres/internal/eigen.h"
#include "ceres/parallel_for.h"
+#include "ceres/parallel_vector_ops.h"
#include "ceres/small_blas.h"
#include "ceres/triplet_sparse_matrix.h"
#include "glog/logging.h"
diff --git a/internal/ceres/eigen_vector_ops.h b/internal/ceres/eigen_vector_ops.h
index 91f2946..8b519e6 100644
--- a/internal/ceres/eigen_vector_ops.h
+++ b/internal/ceres/eigen_vector_ops.h
@@ -35,6 +35,7 @@
#include "ceres/internal/eigen.h"
#include "ceres/parallel_for.h"
+#include "ceres/parallel_vector_ops.h"
namespace ceres::internal {
diff --git a/internal/ceres/implicit_schur_complement.cc b/internal/ceres/implicit_schur_complement.cc
index dbba3a7..9c73de6 100644
--- a/internal/ceres/implicit_schur_complement.cc
+++ b/internal/ceres/implicit_schur_complement.cc
@@ -36,6 +36,7 @@
#include "ceres/internal/eigen.h"
#include "ceres/linear_solver.h"
#include "ceres/parallel_for.h"
+#include "ceres/parallel_vector_ops.h"
#include "ceres/types.h"
#include "glog/logging.h"
diff --git a/internal/ceres/levenberg_marquardt_strategy.cc b/internal/ceres/levenberg_marquardt_strategy.cc
index 81ced2c..b37f74e 100644
--- a/internal/ceres/levenberg_marquardt_strategy.cc
+++ b/internal/ceres/levenberg_marquardt_strategy.cc
@@ -38,7 +38,7 @@
#include "ceres/internal/eigen.h"
#include "ceres/linear_least_squares_problems.h"
#include "ceres/linear_solver.h"
-#include "ceres/parallel_for.h"
+#include "ceres/parallel_vector_ops.h"
#include "ceres/sparse_matrix.h"
#include "ceres/trust_region_strategy.h"
#include "ceres/types.h"
diff --git a/internal/ceres/parallel_for_cxx.cc b/internal/ceres/parallel_for.cc
similarity index 71%
copy from internal/ceres/parallel_for_cxx.cc
copy to internal/ceres/parallel_for.cc
index 3c0671e..a92276d 100644
--- a/internal/ceres/parallel_for_cxx.cc
+++ b/internal/ceres/parallel_for.cc
@@ -28,6 +28,8 @@
//
// Author: vitus@google.com (Michael Vitus)
+#include "ceres/parallel_for.h"
+
#include <algorithm>
#include <atomic>
#include <cmath>
@@ -37,40 +39,11 @@
#include <tuple>
#include "ceres/internal/config.h"
-#include "ceres/parallel_for.h"
+#include "ceres/parallel_vector_ops.h"
#include "glog/logging.h"
namespace ceres::internal {
-BlockUntilFinished::BlockUntilFinished(int num_total_jobs)
- : num_total_jobs_finished_(0), num_total_jobs_(num_total_jobs) {}
-
-void BlockUntilFinished::Finished(int num_jobs_finished) {
- if (num_jobs_finished == 0) return;
- std::lock_guard<std::mutex> lock(mutex_);
- num_total_jobs_finished_ += num_jobs_finished;
- CHECK_LE(num_total_jobs_finished_, num_total_jobs_);
- if (num_total_jobs_finished_ == num_total_jobs_) {
- condition_.notify_one();
- }
-}
-
-void BlockUntilFinished::Block() {
- std::unique_lock<std::mutex> lock(mutex_);
- condition_.wait(
- lock, [&]() { return num_total_jobs_finished_ == num_total_jobs_; });
-}
-
-ThreadPoolState::ThreadPoolState(int start, int end, int num_work_blocks)
- : start(start),
- end(end),
- num_work_blocks(num_work_blocks),
- base_block_size((end - start) / num_work_blocks),
- num_base_p1_sized_blocks((end - start) % num_work_blocks),
- block_id(0),
- thread_id(0),
- block_until_finished(num_work_blocks) {}
-
int MaxNumThreadsAvailable() { return ThreadPool::MaxNumThreadsAvailable(); }
void ParallelSetZero(ContextImpl* context,
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index bf38603..7536c74 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -32,14 +32,14 @@
#ifndef CERES_INTERNAL_PARALLEL_FOR_H_
#define CERES_INTERNAL_PARALLEL_FOR_H_
-#include <algorithm>
-#include <functional>
#include <mutex>
+#include <vector>
#include "ceres/context_impl.h"
-#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
+#include "ceres/parallel_invoke.h"
+#include "ceres/partition_range_for_parallel_for.h"
#include "glog/logging.h"
namespace ceres::internal {
@@ -51,237 +51,82 @@
: std::unique_lock<std::mutex>{m};
}
-// Returns the maximum number of threads supported by the threading backend
-// Ceres was compiled with.
-CERES_NO_EXPORT
-int MaxNumThreadsAvailable();
-
-// Parallel for implementations share a common set of routines in order
-// to enforce inlining of loop bodies, ensuring that single-threaded
-// performance is equivalent to a simple for loop
-namespace parallel_for_details {
-// Get arguments of callable as a tuple
-template <typename F, typename... Args>
-std::tuple<std::decay_t<Args>...> args_of(void (F::*)(Args...) const);
-
-template <typename F>
-using args_of_t = decltype(args_of(&F::operator()));
-
-// Parallelizable functions might require passing thread_id as the first
-// argument. This class supplies thread_id argument to functions that
-// support it and ignores it otherwise.
-template <typename F, typename Args>
-struct InvokeImpl;
-
-// For parallel for iterations of type [](int i) -> void
-template <typename F>
-struct InvokeImpl<F, std::tuple<int>> {
- static void InvokeOnSegment(int thread_id,
- int start,
- int end,
- const F& function) {
- (void)thread_id;
- for (int i = start; i < end; ++i) {
- function(i);
- }
- }
-};
-
-// For parallel for iterations of type [](int thread_id, int i) -> void
-template <typename F>
-struct InvokeImpl<F, std::tuple<int, int>> {
- static void InvokeOnSegment(int thread_id,
- int start,
- int end,
- const F& function) {
- for (int i = start; i < end; ++i) {
- function(thread_id, i);
- }
- }
-};
-
-// For parallel for iterations of type [](tuple<int, int> range) -> void
-template <typename F>
-struct InvokeImpl<F, std::tuple<std::tuple<int, int>>> {
- static void InvokeOnSegment(int thread_id,
- int start,
- int end,
- const F& function) {
- (void)thread_id;
- function(std::make_tuple(start, end));
- }
-};
-
-// For parallel for iterations of type [](int thread_id, tuple<int, int> range)
-// -> void
-template <typename F>
-struct InvokeImpl<F, std::tuple<int, std::tuple<int, int>>> {
- static void InvokeOnSegment(int thread_id,
- int start,
- int end,
- const F& function) {
- function(thread_id, std::make_tuple(start, end));
- }
-};
-
-// Invoke function passing thread_id only if required
-template <typename F>
-void InvokeOnSegment(int thread_id, int start, int end, const F& function) {
- InvokeImpl<F, args_of_t<F>>::InvokeOnSegment(thread_id, start, end, function);
-}
-
-// Check if it is possible to split range [start; end) into at most
-// max_num_partitions contiguous partitions of cost not greater than
-// max_partition_cost. Inclusive integer cumulative costs are provided by
-// cumulative_cost_data objects, with cumulative_cost_offset being a total cost
-// of all indices (starting from zero) preceding start element. Cumulative costs
-// are returned by cumulative_cost_fun called with a reference to
-// cumulative_cost_data element with index from range[start; end), and should be
-// non-decreasing. Partition of the range is returned via partition argument
-template <typename CumulativeCostData, typename CumulativeCostFun>
-bool MaxPartitionCostIsFeasible(int start,
- int end,
- int max_num_partitions,
- int max_partition_cost,
- int cumulative_cost_offset,
- const CumulativeCostData* cumulative_cost_data,
- const CumulativeCostFun& cumulative_cost_fun,
- std::vector<int>* partition) {
- partition->clear();
- partition->push_back(start);
- int partition_start = start;
- int cost_offset = cumulative_cost_offset;
-
- while (partition_start < end) {
- // Already have max_num_partitions
- if (partition->size() > max_num_partitions) {
- return false;
- }
- const int target = max_partition_cost + cost_offset;
- const int partition_end =
- std::partition_point(
- cumulative_cost_data + partition_start,
- cumulative_cost_data + end,
- [&cumulative_cost_fun, target](const CumulativeCostData& item) {
- return cumulative_cost_fun(item) <= target;
- }) -
- cumulative_cost_data;
- // Unable to make a partition from a single element
- if (partition_end == partition_start) {
- return false;
- }
-
- const int cost_last =
- cumulative_cost_fun(cumulative_cost_data[partition_end - 1]);
- partition->push_back(partition_end);
- partition_start = partition_end;
- cost_offset = cost_last;
- }
- return true;
-}
-
-// Split integer interval [start, end) into at most max_num_partitions
-// contiguous intervals, minimizing maximal total cost of a single interval.
-// Inclusive integer cumulative costs for each (zero-based) index are provided
-// by cumulative_cost_data objects, and are returned by cumulative_cost_fun call
-// with a reference to one of the objects from range [start, end)
-template <typename CumulativeCostData, typename CumulativeCostFun>
-std::vector<int> ComputePartition(
- int start,
- int end,
- int max_num_partitions,
- const CumulativeCostData* cumulative_cost_data,
- const CumulativeCostFun& cumulative_cost_fun) {
- // Given maximal partition cost, it is possible to verify if it is admissible
- // and obtain corresponding partition using MaxPartitionCostIsFeasible
- // function. In order to find the lowest admissible value, a binary search
- // over all potentially optimal cost values is being performed
- const int cumulative_cost_last =
- cumulative_cost_fun(cumulative_cost_data[end - 1]);
- const int cumulative_cost_offset =
- start ? cumulative_cost_fun(cumulative_cost_data[start - 1]) : 0;
- const int total_cost = cumulative_cost_last - cumulative_cost_offset;
-
- // Minimal maximal partition cost is not smaller than the average
- // We will use non-inclusive lower bound
- int partition_cost_lower_bound = total_cost / max_num_partitions - 1;
- // Minimal maximal partition cost is not larger than the total cost
- // Upper bound is inclusive
- int partition_cost_upper_bound = total_cost;
-
- std::vector<int> partition, partition_upper_bound;
- // Binary search over partition cost, returning the lowest admissible cost
- while (partition_cost_upper_bound - partition_cost_lower_bound > 1) {
- partition.reserve(max_num_partitions + 1);
- const int partition_cost =
- partition_cost_lower_bound +
- (partition_cost_upper_bound - partition_cost_lower_bound) / 2;
- bool admissible = MaxPartitionCostIsFeasible(start,
- end,
- max_num_partitions,
- partition_cost,
- cumulative_cost_offset,
- cumulative_cost_data,
- cumulative_cost_fun,
- &partition);
- if (admissible) {
- partition_cost_upper_bound = partition_cost;
- std::swap(partition, partition_upper_bound);
- } else {
- partition_cost_lower_bound = partition_cost;
- }
- }
-
- // After binary search over partition cost, interval
- // (partition_cost_lower_bound, partition_cost_upper_bound] contains the only
- // admissible partition cost value - partition_cost_upper_bound
- //
- // Partition for this cost value might have been already computed
- if (partition_upper_bound.empty() == false) {
- return partition_upper_bound;
- }
- // Partition for upper bound is not computed if and only if upper bound was
- // never updated This is a simple case of a single interval containing all
- // values, which we were not able to break into pieces
- partition = {start, end};
- return partition;
-}
-} // namespace parallel_for_details
-
-// Forward declaration of parallel invocation function that is to be
-// implemented by each threading backend
-template <typename F>
-void ParallelInvoke(ContextImpl* context,
- int start,
- int end,
- int num_threads,
- const F& function);
+// Returns the maximum supported number of threads
+CERES_NO_EXPORT int MaxNumThreadsAvailable();
// Execute the function for every element in the range [start, end) with at most
// num_threads. It will execute all the work on the calling thread if
// num_threads or (end - start) is equal to 1.
+// Depending on the function signature, it will be supplied with either a loop
+// index or a range of loop indices; the function can also be supplied with
+// thread_id. The following function signatures are supported:
+//  - Functions accepting a single loop index:
+//     - [](int index) { ... }
+//     - [](int thread_id, int index) { ... }
+//  - Functions accepting a range of loop indices:
+//     - [](std::tuple<int, int> index) { ... }
+//     - [](int thread_id, std::tuple<int, int> index) { ... }
//
-// Functions with two arguments will be passed thread_id and loop index on each
-// invocation, functions with one argument will be invoked with loop index
+// When distributing workload between threads, it is assumed that each loop
+// iteration takes approximately equal time to complete.
template <typename F>
-void ParallelFor(ContextImpl* context,
- int start,
- int end,
- int num_threads,
- const F& function) {
- using namespace parallel_for_details;
+void ParallelFor(
+ ContextImpl* context, int start, int end, int num_threads, F&& function) {
CHECK_GT(num_threads, 0);
if (start >= end) {
return;
}
if (num_threads == 1 || end - start == 1) {
- InvokeOnSegment<F>(0, start, end, function);
+ InvokeOnSegment(0, std::make_tuple(start, end), std::forward<F>(function));
return;
}
CHECK(context != nullptr);
- ParallelInvoke<F>(context, start, end, num_threads, function);
+ ParallelInvoke(context, start, end, num_threads, std::forward<F>(function));
+}
+
+// Execute the function for every element in the range [start, end) with at
+// most num_threads, using the user-provided partitions array.
+// When distributing workload between threads, it is assumed that each segment
+// bounded by adjacent elements of the partitions array takes approximately
+// equal time to process.
+template <typename F>
+void ParallelFor(ContextImpl* context,
+ int start,
+ int end,
+ int num_threads,
+ F&& function,
+ const std::vector<int>& partitions) {
+ CHECK_GT(num_threads, 0);
+ if (start >= end) {
+ return;
+ }
+ CHECK_EQ(partitions.front(), start);
+ CHECK_EQ(partitions.back(), end);
+ if (num_threads == 1 || end - start <= num_threads) {
+ ParallelFor(context, start, end, num_threads, std::forward<F>(function));
+ return;
+ }
+ CHECK_GT(partitions.size(), 1);
+ const int num_partitions = partitions.size() - 1;
+ ParallelFor(context,
+ 0,
+ num_partitions,
+ num_threads,
+ [&function, &partitions](int thread_id,
+ std::tuple<int, int> partition_ids) {
+ // partition_ids is a range of partition indices
+ const auto [partition_start, partition_end] = partition_ids;
+ // Execution over several adjacent segments is equivalent
+ // to execution over union of those segments (which is also a
+ // contiguous segment)
+ const int range_start = partitions[partition_start];
+ const int range_end = partitions[partition_end];
+ // Range of original loop indices
+ const auto range = std::make_tuple(range_start, range_end);
+ InvokeOnSegment(thread_id, range, function);
+ });
}
// Execute function for every element in the range [start, end) with at most
@@ -293,123 +138,43 @@
// user-defined object. Only indices from [start, end) will be referenced. This
// routine assumes that cumulative_cost_fun is non-decreasing (in other words,
// all costs are non-negative);
+// When distributing workload between threads, the input range of loop indices
+// is partitioned into disjoint contiguous intervals, minimizing the maximal
+// cost of a single interval.
+// For example, with iteration costs of [1, 1, 5, 3, 1, 4] cumulative_cost_fun
+// should return [1, 2, 7, 10, 11, 15], and with num_threads = 4 this range
+// will be split into segments [0, 2) [2, 3) [3, 5) [5, 6) with costs
+// [2, 5, 4, 4].
template <typename F, typename CumulativeCostData, typename CumulativeCostFun>
void ParallelFor(ContextImpl* context,
int start,
int end,
int num_threads,
- const F& function,
+ F&& function,
const CumulativeCostData* cumulative_cost_data,
- const CumulativeCostFun& cumulative_cost_fun) {
- using namespace parallel_for_details;
+ CumulativeCostFun&& cumulative_cost_fun) {
CHECK_GT(num_threads, 0);
if (start >= end) {
return;
}
if (num_threads == 1 || end - start <= num_threads) {
- ParallelFor(context, start, end, num_threads, function);
+ ParallelFor(context, start, end, num_threads, std::forward<F>(function));
return;
}
// Creating several partitions allows us to tolerate imperfections of
// partitioning and user-supplied iteration costs up to a certain extent
- const int kNumPartitionsPerThread = 4;
+ constexpr int kNumPartitionsPerThread = 4;
const int kMaxPartitions = num_threads * kNumPartitionsPerThread;
- const std::vector<int> partitions = ComputePartition(
- start, end, kMaxPartitions, cumulative_cost_data, cumulative_cost_fun);
+ const auto& partitions = PartitionRangeForParallelFor(
+ start,
+ end,
+ kMaxPartitions,
+ cumulative_cost_data,
+ std::forward<CumulativeCostFun>(cumulative_cost_fun));
CHECK_GT(partitions.size(), 1);
- const int num_partitions = partitions.size() - 1;
- ParallelFor(context,
- 0,
- num_partitions,
- num_threads,
- [&function, &partitions](int thread_id, int partition_id) {
- const int partition_start = partitions[partition_id];
- const int partition_end = partitions[partition_id + 1];
-
- InvokeOnSegment<F>(
- thread_id, partition_start, partition_end, function);
- });
+ ParallelFor(
+ context, start, end, num_threads, std::forward<F>(function), partitions);
}
-
-// Execute function for every element in the range [start, end) with at most
-// num_threads, using the user provided partitioning. taking into account
-// user-provided integer cumulative costs of iterations.
-template <typename F>
-void ParallelFor(ContextImpl* context,
- int start,
- int end,
- int num_threads,
- const F& function,
- const std::vector<int>& partitions) {
- using namespace parallel_for_details;
- CHECK_GT(num_threads, 0);
- if (start >= end) {
- return;
- }
- CHECK_EQ(partitions.front(), start);
- CHECK_EQ(partitions.back(), end);
- if (num_threads == 1 || end - start <= num_threads) {
- ParallelFor(context, start, end, num_threads, function);
- return;
- }
- CHECK_GT(partitions.size(), 1);
- const int num_partitions = partitions.size() - 1;
- ParallelFor(context,
- 0,
- num_partitions,
- num_threads,
- [&function, &partitions](int thread_id, int partition_id) {
- const int partition_start = partitions[partition_id];
- const int partition_end = partitions[partition_id + 1];
-
- InvokeOnSegment<F>(
- thread_id, partition_start, partition_end, function);
- });
-}
-
-// Evaluate vector expression in parallel
-// Assuming LhsExpression and RhsExpression are some sort of
-// column-vector expression, assignment lhs = rhs
-// is eavluated over a set of contiguous blocks in parallel.
-// This is expected to work well in the case of vector-based
-// expressions (since they typically do not result into
-// temporaries).
-// This method expects lhs to be size-compatible with rhs
-template <typename LhsExpression, typename RhsExpression>
-void ParallelAssign(ContextImpl* context,
- int num_threads,
- LhsExpression& lhs,
- const RhsExpression& rhs) {
- static_assert(LhsExpression::ColsAtCompileTime == 1);
- static_assert(RhsExpression::ColsAtCompileTime == 1);
- CHECK_EQ(lhs.rows(), rhs.rows());
- const int num_rows = lhs.rows();
- ParallelFor(context,
- 0,
- num_rows,
- num_threads,
- [&lhs, &rhs](const std::tuple<int, int>& range) {
- auto [start, end] = range;
- lhs.segment(start, end - start) =
- rhs.segment(start, end - start);
- });
-}
-
-// Set vector to zero using num_threads
-template <typename VectorType>
-void ParallelSetZero(ContextImpl* context,
- int num_threads,
- VectorType& vector) {
- ParallelSetZero(context, num_threads, vector.data(), vector.rows());
-}
-void ParallelSetZero(ContextImpl* context,
- int num_threads,
- double* values,
- int num_values);
} // namespace ceres::internal
-// Backend-specific implementations of ParallelInvoke
-#include "ceres/internal/disable_warnings.h"
-#include "ceres/parallel_for_cxx.h"
-
#endif // CERES_INTERNAL_PARALLEL_FOR_H_
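For orientation, a minimal sketch of the four supported callable shapes
(assuming an initialized ceres::internal::ContextImpl*, which is an internal
API; names below are illustrative):

    #include <algorithm>
    #include <tuple>
    #include <vector>

    #include "ceres/parallel_for.h"

    void Example(ceres::internal::ContextImpl* context) {
      using ceres::internal::ParallelFor;
      const int num_threads = 4;
      std::vector<double> v(1000, 1.0);
      std::vector<double> per_thread(num_threads, 0.0);

      // 1. A single loop index.
      ParallelFor(context, 0, 1000, num_threads, [&v](int i) { v[i] *= 2.0; });

      // 2. thread_id plus loop index, e.g. for per-thread accumulation.
      ParallelFor(context, 0, 1000, num_threads,
                  [&](int thread_id, int i) { per_thread[thread_id] += v[i]; });

      // 3. One contiguous index range per block, enabling batched bodies.
      ParallelFor(context, 0, 1000, num_threads,
                  [&v](std::tuple<int, int> range) {
                    auto [start, end] = range;
                    std::fill(v.begin() + start, v.begin() + end, 0.0);
                  });

      // 4. thread_id plus an index range.
      ParallelFor(context, 0, 1000, num_threads,
                  [&](int thread_id, std::tuple<int, int> range) {
                    auto [start, end] = range;
                    per_thread[thread_id] += end - start;
                  });
    }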
diff --git a/internal/ceres/parallel_for_test.cc b/internal/ceres/parallel_for_test.cc
index 871bb52..b3c210f 100644
--- a/internal/ceres/parallel_for_test.cc
+++ b/internal/ceres/parallel_for_test.cc
@@ -41,6 +41,7 @@
#include "ceres/context_impl.h"
#include "ceres/internal/config.h"
+#include "ceres/parallel_vector_ops.h"
#include "glog/logging.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
@@ -190,8 +191,6 @@
// Basic test if MaxPartitionCostIsFeasible and BruteForcePartition agree on
// simple test-cases
TEST(GuidedParallelFor, MaxPartitionCostIsFeasible) {
- using parallel_for_details::MaxPartitionCostIsFeasible;
-
std::vector<int> costs, cumulative_costs, partition;
costs = {1, 2, 3, 5, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0};
cumulative_costs.resize(costs.size());
@@ -242,8 +241,6 @@
// Randomized tests for MaxPartitionCostIsFeasible
TEST(GuidedParallelFor, MaxPartitionCostIsFeasibleRandomized) {
- using parallel_for_details::MaxPartitionCostIsFeasible;
-
std::vector<int> costs, cumulative_costs, partition;
const auto dummy_getter = [](const int v) { return v; };
@@ -315,9 +312,7 @@
}
}
-TEST(GuidedParallelFor, ComputePartition) {
- using parallel_for_details::ComputePartition;
-
+TEST(GuidedParallelFor, PartitionRangeForParallelFor) {
std::vector<int> costs, cumulative_costs, partition;
const auto dummy_getter = [](const int v) { return v; };
@@ -359,8 +354,8 @@
}
}
EXPECT_TRUE(first_admissible != 0 || total == 0);
- partition =
- ComputePartition(start, end, M, cumulative_costs.data(), dummy_getter);
+ partition = PartitionRangeForParallelFor(
+ start, end, M, cumulative_costs.data(), dummy_getter);
ASSERT_GT(partition.size(), 1);
EXPECT_EQ(partition.front(), start);
EXPECT_EQ(partition.back(), end);
diff --git a/internal/ceres/parallel_for_cxx.cc b/internal/ceres/parallel_invoke.cc
similarity index 79%
rename from internal/ceres/parallel_for_cxx.cc
rename to internal/ceres/parallel_invoke.cc
index 3c0671e..0e387c5 100644
--- a/internal/ceres/parallel_for_cxx.cc
+++ b/internal/ceres/parallel_invoke.cc
@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2018 Google Inc. All rights reserved.
+// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,7 @@
#include "ceres/internal/config.h"
#include "ceres/parallel_for.h"
+#include "ceres/parallel_vector_ops.h"
#include "glog/logging.h"
namespace ceres::internal {
@@ -58,10 +59,12 @@
void BlockUntilFinished::Block() {
std::unique_lock<std::mutex> lock(mutex_);
condition_.wait(
- lock, [&]() { return num_total_jobs_finished_ == num_total_jobs_; });
+ lock, [this]() { return num_total_jobs_finished_ == num_total_jobs_; });
}
-ThreadPoolState::ThreadPoolState(int start, int end, int num_work_blocks)
+ParallelInvokeState::ParallelInvokeState(int start,
+ int end,
+ int num_work_blocks)
: start(start),
end(end),
num_work_blocks(num_work_blocks),
@@ -71,20 +74,4 @@
thread_id(0),
block_until_finished(num_work_blocks) {}
-int MaxNumThreadsAvailable() { return ThreadPool::MaxNumThreadsAvailable(); }
-
-void ParallelSetZero(ContextImpl* context,
- int num_threads,
- double* values,
- int num_values) {
- ParallelFor(context,
- 0,
- num_values,
- num_threads,
- [values](std::tuple<int, int> range) {
- auto [start, end] = range;
- std::fill(values + start, values + end, 0.);
- });
-}
-
} // namespace ceres::internal
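The [&] to [this] capture change above is behavior-preserving. For reference,
a self-contained sketch of the same condition-variable barrier pattern that
BlockUntilFinished implements (illustrative class name, not the patch's code):

    #include <condition_variable>
    #include <mutex>

    // Counting barrier: workers report finished jobs, Block() returns once
    // every job has been accounted for.
    class CountingBarrier {
     public:
      explicit CountingBarrier(int num_total_jobs)
          : num_total_jobs_(num_total_jobs) {}

      void Finished(int num_jobs_finished) {
        if (num_jobs_finished == 0) return;
        std::lock_guard<std::mutex> lock(mutex_);
        num_finished_ += num_jobs_finished;
        if (num_finished_ == num_total_jobs_) condition_.notify_one();
      }

      void Block() {
        std::unique_lock<std::mutex> lock(mutex_);
        // Waiting on a predicate guards against spurious wakeups.
        condition_.wait(lock,
                        [this] { return num_finished_ == num_total_jobs_; });
      }

     private:
      std::mutex mutex_;
      std::condition_variable condition_;
      int num_finished_ = 0;
      const int num_total_jobs_;
    };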
diff --git a/internal/ceres/parallel_for_cxx.h b/internal/ceres/parallel_invoke.h
similarity index 85%
rename from internal/ceres/parallel_for_cxx.h
rename to internal/ceres/parallel_invoke.h
index fb87156..3e67e37 100644
--- a/internal/ceres/parallel_for_cxx.h
+++ b/internal/ceres/parallel_invoke.h
@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2018 Google Inc. All rights reserved.
+// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -29,19 +29,47 @@
// Authors: vitus@google.com (Michael Vitus),
// dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
-#ifndef CERES_INTERNAL_PARALLEL_FOR_CXX_H_
-#define CERES_INTERNAL_PARALLEL_FOR_CXX_H_
+#ifndef CERES_INTERNAL_PARALLEL_INVOKE_H_
+#define CERES_INTERNAL_PARALLEL_INVOKE_H_
#include <atomic>
-#include <cmath>
#include <condition_variable>
#include <memory>
#include <mutex>
-
-#include "ceres/internal/config.h"
-#include "glog/logging.h"
+#include <tuple>
+#include <type_traits>
namespace ceres::internal {
+
+// InvokeWithThreadId handles passing thread_id to the function
+template <typename F, typename... Args>
+void InvokeWithThreadId(int thread_id, F&& function, Args&&... args) {
+ constexpr bool kPassThreadId = std::is_invocable_v<F, int, Args...>;
+
+ if constexpr (kPassThreadId) {
+ function(thread_id, std::forward<Args>(args)...);
+ } else {
+ function(std::forward<Args>(args)...);
+ }
+}
+
+// InvokeOnSegment either runs a loop over segment indices or passes the whole
+// range to the function
+template <typename F>
+void InvokeOnSegment(int thread_id, std::tuple<int, int> range, F&& function) {
+ constexpr bool kExplicitLoop =
+ std::is_invocable_v<F, int> || std::is_invocable_v<F, int, int>;
+
+ if constexpr (kExplicitLoop) {
+ const auto [start, end] = range;
+ for (int i = start; i != end; ++i) {
+ InvokeWithThreadId(thread_id, std::forward<F>(function), i);
+ }
+ } else {
+ InvokeWithThreadId(thread_id, std::forward<F>(function), range);
+ }
+}
+
// This class creates a thread safe barrier which will block until a
// pre-specified number of threads call Finished. This allows us to block the
// main thread until all the parallel threads are finished processing all the
@@ -62,18 +90,17 @@
std::mutex mutex_;
std::condition_variable condition_;
int num_total_jobs_finished_;
- int num_total_jobs_;
+ const int num_total_jobs_;
};
// Shared state between the parallel tasks. Each thread will use this
// information to get the next block of work to be performed.
-struct ThreadPoolState {
+struct ParallelInvokeState {
// The entire range [start, end) is split into num_work_blocks contiguous
// disjoint intervals (blocks), which are as equal as possible given
// total index count and requested number of blocks.
//
- // Those num_work_blocks blocks are then processed by num_workers
- // workers
+ // Those num_work_blocks blocks are then processed in parallel.
//
// Total number of integer indices in interval [start, end) is
// end - start, and when splitting them into num_work_blocks blocks
@@ -87,7 +114,7 @@
// Note that this splitting is optimal in the sense of maximal difference
// between block sizes, since splitting into equal blocks is possible
// if and only if number of indices is divisible by number of blocks.
- ThreadPoolState(int start, int end, int num_work_blocks);
+ ParallelInvokeState(int start, int end, int num_work_blocks);
// The start and end index of the for loop.
const int start;
@@ -135,12 +162,8 @@
// A performance analysis has shown this implementation is on par with OpenMP
// and TBB.
template <typename F>
-void ParallelInvoke(ContextImpl* context,
- int start,
- int end,
- int num_threads,
- const F& function) {
- using namespace parallel_for_details;
+void ParallelInvoke(
+ ContextImpl* context, int start, int end, int num_threads, F&& function) {
CHECK(context != nullptr);
// Maximal number of work items scheduled for a single thread
@@ -159,8 +182,8 @@
// We use a std::shared_ptr because the main thread can finish all
// the work before the tasks have been popped off the queue. So the
// shared state needs to exist for the duration of all the tasks.
- std::shared_ptr<ThreadPoolState> shared_state(
- new ThreadPoolState(start, end, num_work_blocks));
+ auto shared_state =
+ std::make_shared<ParallelInvokeState>(start, end, num_work_blocks);
// A function which tries to perform several chunks of work.
auto task = [shared_state, num_threads, &function]() {
@@ -213,7 +236,8 @@
const int curr_end = curr_start + base_block_size +
(block_id < num_base_p1_sized_blocks ? 1 : 0);
// Perform each task in current block
- InvokeOnSegment<F>(thread_id, curr_start, curr_end, function);
+ const auto range = std::make_tuple(curr_start, curr_end);
+ InvokeOnSegment(thread_id, range, function);
}
shared_state->block_until_finished.Finished(num_jobs_finished);
};
@@ -239,4 +263,4 @@
} // namespace ceres::internal
-#endif // CERES_INTERNAL_PARALLEL_FOR_CXX_H_
+#endif  // CERES_INTERNAL_PARALLEL_INVOKE_H_
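The new InvokeWithThreadId/InvokeOnSegment helpers select the call shape at
compile time with std::is_invocable_v and if constexpr. A standalone sketch of
the same idiom (illustrative names, not the patch's code):

    #include <iostream>
    #include <type_traits>
    #include <utility>

    // Calls f(thread_id, args...) if f accepts a leading int thread_id,
    // and f(args...) otherwise.
    template <typename F, typename... Args>
    void MaybePassThreadId(int thread_id, F&& f, Args&&... args) {
      if constexpr (std::is_invocable_v<F, int, Args...>) {
        f(thread_id, std::forward<Args>(args)...);
      } else {
        f(std::forward<Args>(args)...);
      }
    }

    int main() {
      // Lambda without thread_id: prints "i=42".
      MaybePassThreadId(7, [](int i) { std::cout << "i=" << i << "\n"; }, 42);
      // Lambda with thread_id: prints "7:42".
      MaybePassThreadId(
          7, [](int tid, int i) { std::cout << tid << ":" << i << "\n"; }, 42);
    }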
diff --git a/internal/ceres/parallel_vector_ops.h b/internal/ceres/parallel_vector_ops.h
new file mode 100644
index 0000000..20d6dbc
--- /dev/null
+++ b/internal/ceres/parallel_vector_ops.h
@@ -0,0 +1,87 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: vitus@google.com (Michael Vitus),
+// dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#ifndef CERES_INTERNAL_PARALLEL_VECTOR_OPS_H_
+#define CERES_INTERNAL_PARALLEL_VECTOR_OPS_H_
+
+#include <mutex>
+#include <vector>
+
+#include "ceres/context_impl.h"
+#include "ceres/internal/eigen.h"
+#include "ceres/internal/export.h"
+#include "ceres/parallel_for.h"
+
+namespace ceres::internal {
+
+// Evaluate a vector expression in parallel.
+// Assuming LhsExpression and RhsExpression are some sort of
+// column-vector expression, the assignment lhs = rhs
+// is evaluated over a set of contiguous blocks in parallel.
+// This is expected to work well in the case of vector-based
+// expressions (since they typically do not result in
+// temporaries).
+// This method expects lhs to be size-compatible with rhs.
+template <typename LhsExpression, typename RhsExpression>
+void ParallelAssign(ContextImpl* context,
+ int num_threads,
+ LhsExpression& lhs,
+ const RhsExpression& rhs) {
+ static_assert(LhsExpression::ColsAtCompileTime == 1);
+ static_assert(RhsExpression::ColsAtCompileTime == 1);
+ CHECK_EQ(lhs.rows(), rhs.rows());
+ const int num_rows = lhs.rows();
+ ParallelFor(context,
+ 0,
+ num_rows,
+ num_threads,
+ [&lhs, &rhs](const std::tuple<int, int>& range) {
+ auto [start, end] = range;
+ lhs.segment(start, end - start) =
+ rhs.segment(start, end - start);
+ });
+}
+
+// Set vector to zero using num_threads
+template <typename VectorType>
+void ParallelSetZero(ContextImpl* context,
+ int num_threads,
+ VectorType& vector) {
+ ParallelSetZero(context, num_threads, vector.data(), vector.rows());
+}
+void ParallelSetZero(ContextImpl* context,
+ int num_threads,
+ double* values,
+ int num_values);
+
+} // namespace ceres::internal
+
+#endif  // CERES_INTERNAL_PARALLEL_VECTOR_OPS_H_
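A brief usage sketch for the two helpers (assuming an initialized
ceres::internal::ContextImpl*; Vector is the Eigen column-vector alias from
ceres/internal/eigen.h):

    #include "ceres/internal/eigen.h"
    #include "ceres/parallel_vector_ops.h"

    void Example(ceres::internal::ContextImpl* context) {
      using ceres::internal::Vector;
      Vector x(1024);
      const Vector y = Vector::Random(1024);

      // Zero x in parallel; the templated overload forwards to the
      // double* overload declared above.
      ceres::internal::ParallelSetZero(context, /*num_threads=*/4, x);

      // Evaluate x = 2 * y with each thread assigning a contiguous segment;
      // the coefficient-wise expression creates no temporary.
      ceres::internal::ParallelAssign(context, /*num_threads=*/4, x, 2.0 * y);
    }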
diff --git a/internal/ceres/partition_range_for_parallel_for.h b/internal/ceres/partition_range_for_parallel_for.h
new file mode 100644
index 0000000..309d7a8
--- /dev/null
+++ b/internal/ceres/partition_range_for_parallel_for.h
@@ -0,0 +1,150 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: vitus@google.com (Michael Vitus),
+// dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#ifndef CERES_INTERNAL_PARTITION_RANGE_FOR_PARALLEL_FOR_H_
+#define CERES_INTERNAL_PARTITION_RANGE_FOR_PARALLEL_FOR_H_
+
+#include <algorithm>
+#include <vector>
+
+namespace ceres::internal {
+// Check if it is possible to split the range [start, end) into at most
+// max_num_partitions contiguous partitions of cost not greater than
+// max_partition_cost. Inclusive integer cumulative costs are provided by
+// cumulative_cost_data objects, with cumulative_cost_offset being the total
+// cost of all indices (starting from zero) preceding the start element.
+// Cumulative costs are returned by cumulative_cost_fun called with a
+// reference to the cumulative_cost_data element with index in [start, end),
+// and should be non-decreasing. The resulting partition is returned via the
+// partition argument.
+template <typename CumulativeCostData, typename CumulativeCostFun>
+bool MaxPartitionCostIsFeasible(int start,
+ int end,
+ int max_num_partitions,
+ int max_partition_cost,
+ int cumulative_cost_offset,
+ const CumulativeCostData* cumulative_cost_data,
+ CumulativeCostFun&& cumulative_cost_fun,
+ std::vector<int>* partition) {
+ partition->clear();
+ partition->push_back(start);
+ int partition_start = start;
+ int cost_offset = cumulative_cost_offset;
+
+ while (partition_start < end) {
+ // Already have max_num_partitions
+ if (partition->size() > max_num_partitions) {
+ return false;
+ }
+ const int target = max_partition_cost + cost_offset;
+ const int partition_end =
+ std::partition_point(
+ cumulative_cost_data + partition_start,
+ cumulative_cost_data + end,
+ [&cumulative_cost_fun, target](const CumulativeCostData& item) {
+ return cumulative_cost_fun(item) <= target;
+ }) -
+ cumulative_cost_data;
+ // Unable to make a partition from a single element
+ if (partition_end == partition_start) {
+ return false;
+ }
+
+ const int cost_last =
+ cumulative_cost_fun(cumulative_cost_data[partition_end - 1]);
+ partition->push_back(partition_end);
+ partition_start = partition_end;
+ cost_offset = cost_last;
+ }
+ return true;
+}
+
+// Split the integer interval [start, end) into at most max_num_partitions
+// contiguous intervals, minimizing the maximal total cost of a single
+// interval. Inclusive integer cumulative costs for each (zero-based) index
+// are provided by cumulative_cost_data objects, and are returned by a
+// cumulative_cost_fun call with a reference to one of the objects from the
+// range [start, end).
+template <typename CumulativeCostData, typename CumulativeCostFun>
+std::vector<int> PartitionRangeForParallelFor(
+ int start,
+ int end,
+ int max_num_partitions,
+ const CumulativeCostData* cumulative_cost_data,
+ CumulativeCostFun&& cumulative_cost_fun) {
+ // Given maximal partition cost, it is possible to verify if it is admissible
+ // and obtain corresponding partition using MaxPartitionCostIsFeasible
+ // function. In order to find the lowest admissible value, a binary search
+ // over all potentially optimal cost values is being performed
+ const int cumulative_cost_last =
+ cumulative_cost_fun(cumulative_cost_data[end - 1]);
+ const int cumulative_cost_offset =
+ start ? cumulative_cost_fun(cumulative_cost_data[start - 1]) : 0;
+ const int total_cost = cumulative_cost_last - cumulative_cost_offset;
+
+ // Minimal maximal partition cost is not smaller than the average
+ // We will use non-inclusive lower bound
+ int partition_cost_lower_bound = total_cost / max_num_partitions - 1;
+ // Minimal maximal partition cost is not larger than the total cost
+ // Upper bound is inclusive
+ int partition_cost_upper_bound = total_cost;
+
+ std::vector<int> partition;
+ // Range partition corresponding to the latest evaluated upper bound.
+ // A single segment covering the whole input interval [start, end) corresponds
+ // to minimal maximal partition cost of total_cost.
+ std::vector<int> partition_upper_bound = {start, end};
+ // Binary search over partition cost, returning the lowest admissible cost
+ while (partition_cost_upper_bound - partition_cost_lower_bound > 1) {
+ partition.reserve(max_num_partitions + 1);
+ const int partition_cost =
+ partition_cost_lower_bound +
+ (partition_cost_upper_bound - partition_cost_lower_bound) / 2;
+ bool admissible = MaxPartitionCostIsFeasible(
+ start,
+ end,
+ max_num_partitions,
+ partition_cost,
+ cumulative_cost_offset,
+ cumulative_cost_data,
+ std::forward<CumulativeCostFun>(cumulative_cost_fun),
+ &partition);
+ if (admissible) {
+ partition_cost_upper_bound = partition_cost;
+ std::swap(partition, partition_upper_bound);
+ } else {
+ partition_cost_lower_bound = partition_cost;
+ }
+ }
+
+ return partition_upper_bound;
+}
+} // namespace ceres::internal
+
+#endif  // CERES_INTERNAL_PARTITION_RANGE_FOR_PARALLEL_FOR_H_
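To make the binary search concrete, a sketch reproducing the worked example
from the ParallelFor documentation above (iteration costs [1, 1, 5, 3, 1, 4];
the limit of 16 partitions corresponds to num_threads = 4 times
kNumPartitionsPerThread = 4):

    #include <cassert>
    #include <vector>

    #include "ceres/partition_range_for_parallel_for.h"

    int main() {
      // Inclusive cumulative costs for iteration costs {1, 1, 5, 3, 1, 4}.
      const std::vector<int> cumulative_costs = {1, 2, 7, 10, 11, 15};
      const std::vector<int> partition =
          ceres::internal::PartitionRangeForParallelFor(
              0, 6, /*max_num_partitions=*/16, cumulative_costs.data(),
              [](int cumulative_cost) { return cumulative_cost; });
      // The costliest single iteration (5) bounds the maximal segment cost
      // from below, and it is attained by segments [0, 2) [2, 3) [3, 5) [5, 6).
      assert((partition == std::vector<int>{0, 2, 3, 5, 6}));
      return 0;
    }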
diff --git a/internal/ceres/partitioned_matrix_view_impl.h b/internal/ceres/partitioned_matrix_view_impl.h
index e685f5b..385ceb1 100644
--- a/internal/ceres/partitioned_matrix_view_impl.h
+++ b/internal/ceres/partitioned_matrix_view_impl.h
@@ -37,6 +37,7 @@
#include "ceres/block_structure.h"
#include "ceres/internal/eigen.h"
#include "ceres/parallel_for.h"
+#include "ceres/partition_range_for_parallel_for.h"
#include "ceres/partitioned_matrix_view.h"
#include "ceres/small_blas.h"
#include "glog/logging.h"
@@ -87,14 +88,14 @@
const int num_threads = options_.num_threads;
if (transpose_bs != nullptr && num_threads > 1) {
int kMaxPartitions = num_threads * 4;
- e_cols_partition_ = parallel_for_details::ComputePartition(
+ e_cols_partition_ = PartitionRangeForParallelFor(
0,
num_col_blocks_e_,
kMaxPartitions,
transpose_bs->rows.data(),
[](const CompressedRow& row) { return row.cumulative_nnz; });
- f_cols_partition_ = parallel_for_details::ComputePartition(
+ f_cols_partition_ = PartitionRangeForParallelFor(
num_col_blocks_e_,
num_col_blocks_e_ + num_col_blocks_f_,
kMaxPartitions,
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h
index 7ccbbd1..372b260 100644
--- a/internal/ceres/program_evaluator.h
+++ b/internal/ceres/program_evaluator.h
@@ -43,7 +43,7 @@
// residual jacobians are written directly into their final position in the
// block sparse matrix by the user's CostFunction; there is no copying.
//
-// The evaluation is threaded with OpenMP or C++ threads.
+// The evaluation is threaded with C++ threads.
//
// The EvaluatePreparer and JacobianWriter interfaces are as follows:
//
@@ -96,6 +96,7 @@
#include "ceres/execution_summary.h"
#include "ceres/internal/eigen.h"
#include "ceres/parallel_for.h"
+#include "ceres/parallel_vector_ops.h"
#include "ceres/parameter_block.h"
#include "ceres/program.h"
#include "ceres/residual_block.h"
diff --git a/internal/ceres/wall_time.h b/internal/ceres/wall_time.h
index 36936ee..d503493 100644
--- a/internal/ceres/wall_time.h
+++ b/internal/ceres/wall_time.h
@@ -41,10 +41,8 @@
namespace ceres::internal {
-// Returns time, in seconds, from some arbitrary starting point. If
-// OpenMP is available then the high precision openmp_get_wtime()
-// function is used. Otherwise on unixes, gettimeofday is used. The
-// granularity is in seconds on windows systems.
+// Returns time, in seconds, from some arbitrary starting point. On unixes,
+// gettimeofday is used. The granularity is microseconds.
CERES_NO_EXPORT double WallTimeInSeconds();
// Log a series of events, recording for each event the time elapsed