| // Ceres Solver - A fast non-linear least squares minimizer |
| // Copyright 2018 Google Inc. All rights reserved. |
| // http://ceres-solver.org/ |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are met: |
| // |
| // * Redistributions of source code must retain the above copyright notice, |
| // this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above copyright notice, |
| // this list of conditions and the following disclaimer in the documentation |
| // and/or other materials provided with the distribution. |
| // * Neither the name of Google Inc. nor the names of its contributors may be |
| // used to endorse or promote products derived from this software without |
| // specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| // POSSIBILITY OF SUCH DAMAGE. |
| // |
| // Authors: vitus@google.com (Michael Vitus), |
| // dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) |
| |
| #ifndef CERES_INTERNAL_PARALLEL_FOR_H_ |
| #define CERES_INTERNAL_PARALLEL_FOR_H_ |
| |
| #include <algorithm> |
| #include <functional> |
| #include <mutex> |
| |
| #include "ceres/context_impl.h" |
| #include "ceres/internal/disable_warnings.h" |
| #include "ceres/internal/eigen.h" |
| #include "ceres/internal/export.h" |
| #include "glog/logging.h" |
| |
| namespace ceres::internal { |
| |
| // Use a dummy mutex if num_threads = 1. |
| inline decltype(auto) MakeConditionalLock(const int num_threads, |
| std::mutex& m) { |
| return (num_threads == 1) ? std::unique_lock<std::mutex>{} |
| : std::unique_lock<std::mutex>{m}; |
| } |
| |
| // Returns the maximum number of threads supported by the threading backend |
| // Ceres was compiled with. |
| CERES_NO_EXPORT |
| int MaxNumThreadsAvailable(); |
| |
| // Parallel for implementations share a common set of routines in order |
| // to enforce inlining of loop bodies, ensuring that single-threaded |
| // performance is equivalent to a simple for loop |
| namespace parallel_for_details { |
| // Get arguments of callable as a tuple |
| template <typename F, typename... Args> |
| std::tuple<std::decay_t<Args>...> args_of(void (F::*)(Args...) const); |
| |
| template <typename F> |
| using args_of_t = decltype(args_of(&F::operator())); |
| |
| // Parallelizable functions might require passing thread_id as the first |
| // argument. This class supplies thread_id argument to functions that |
| // support it and ignores it otherwise. |
| template <typename F, typename Args> |
| struct InvokeImpl; |
| |
| // For parallel for iterations of type [](int i) -> void |
| template <typename F> |
| struct InvokeImpl<F, std::tuple<int>> { |
| static void InvokeOnSegment(int thread_id, |
| int start, |
| int end, |
| const F& function) { |
| (void)thread_id; |
| for (int i = start; i < end; ++i) { |
| function(i); |
| } |
| } |
| }; |
| |
| // For parallel for iterations of type [](int thread_id, int i) -> void |
| template <typename F> |
| struct InvokeImpl<F, std::tuple<int, int>> { |
| static void InvokeOnSegment(int thread_id, |
| int start, |
| int end, |
| const F& function) { |
| for (int i = start; i < end; ++i) { |
| function(thread_id, i); |
| } |
| } |
| }; |
| |
| // For parallel for iterations of type [](tuple<int, int> range) -> void |
| template <typename F> |
| struct InvokeImpl<F, std::tuple<std::tuple<int, int>>> { |
| static void InvokeOnSegment(int thread_id, |
| int start, |
| int end, |
| const F& function) { |
| (void)thread_id; |
| function(std::make_tuple(start, end)); |
| } |
| }; |
| |
| // For parallel for iterations of type [](int thread_id, tuple<int, int> range) |
| // -> void |
| template <typename F> |
| struct InvokeImpl<F, std::tuple<int, std::tuple<int, int>>> { |
| static void InvokeOnSegment(int thread_id, |
| int start, |
| int end, |
| const F& function) { |
| function(thread_id, std::make_tuple(start, end)); |
| } |
| }; |
| |
| // Invoke function passing thread_id only if required |
| template <typename F> |
| void InvokeOnSegment(int thread_id, int start, int end, const F& function) { |
| InvokeImpl<F, args_of_t<F>>::InvokeOnSegment(thread_id, start, end, function); |
| } |
| |
| // Check if it is possible to split range [start; end) into at most |
| // max_num_partitions contiguous partitions of cost not greater than |
| // max_partition_cost. Inclusive integer cumulative costs are provided by |
| // cumulative_cost_data objects, with cumulative_cost_offset being a total cost |
| // of all indices (starting from zero) preceding start element. Cumulative costs |
| // are returned by cumulative_cost_fun called with a reference to |
| // cumulative_cost_data element with index from range[start; end), and should be |
| // non-decreasing. Partition of the range is returned via partition argument |
| template <typename CumulativeCostData, typename CumulativeCostFun> |
| bool MaxPartitionCostIsFeasible(int start, |
| int end, |
| int max_num_partitions, |
| int max_partition_cost, |
| int cumulative_cost_offset, |
| const CumulativeCostData* cumulative_cost_data, |
| const CumulativeCostFun& cumulative_cost_fun, |
| std::vector<int>* partition) { |
| partition->clear(); |
| partition->push_back(start); |
| int partition_start = start; |
| int cost_offset = cumulative_cost_offset; |
| |
| const CumulativeCostData* const range_end = cumulative_cost_data + end; |
| while (partition_start < end) { |
| // Already have max_num_partitions |
| if (partition->size() > max_num_partitions) { |
| return false; |
| } |
| const int target = max_partition_cost + cost_offset; |
| const int partition_end = |
| std::partition_point( |
| cumulative_cost_data + partition_start, |
| cumulative_cost_data + end, |
| [&cumulative_cost_fun, target](const CumulativeCostData& item) { |
| return cumulative_cost_fun(item) <= target; |
| }) - |
| cumulative_cost_data; |
| // Unable to make a partition from a single element |
| if (partition_end == partition_start) { |
| return false; |
| } |
| |
| const int cost_last = |
| cumulative_cost_fun(cumulative_cost_data[partition_end - 1]); |
| partition->push_back(partition_end); |
| partition_start = partition_end; |
| cost_offset = cost_last; |
| } |
| return true; |
| } |
| |
| // Split integer interval [start, end) into at most max_num_partitions |
| // contiguous intervals, minimizing maximal total cost of a single interval. |
| // Inclusive integer cumulative costs for each (zero-based) index are provided |
| // by cumulative_cost_data objects, and are returned by cumulative_cost_fun call |
| // with a reference to one of the objects from range [start, end) |
| template <typename CumulativeCostData, typename CumulativeCostFun> |
| std::vector<int> ComputePartition( |
| int start, |
| int end, |
| int max_num_partitions, |
| const CumulativeCostData* cumulative_cost_data, |
| const CumulativeCostFun& cumulative_cost_fun) { |
| // Given maximal partition cost, it is possible to verify if it is admissible |
| // and obtain corresponding partition using MaxPartitionCostIsFeasible |
| // function. In order to find the lowest admissible value, a binary search |
| // over all potentially optimal cost values is being performed |
| const int cumulative_cost_last = |
| cumulative_cost_fun(cumulative_cost_data[end - 1]); |
| const int cumulative_cost_offset = |
| start ? cumulative_cost_fun(cumulative_cost_data[start - 1]) : 0; |
| const int total_cost = cumulative_cost_last - cumulative_cost_offset; |
| |
| // Minimal maximal partition cost is not smaller than the average |
| // We will use non-inclusive lower bound |
| int partition_cost_lower_bound = total_cost / max_num_partitions - 1; |
| // Minimal maximal partition cost is not larger than the total cost |
| // Upper bound is inclusive |
| int partition_cost_upper_bound = total_cost; |
| |
| std::vector<int> partition, partition_upper_bound; |
| // Binary search over partition cost, returning the lowest admissible cost |
| while (partition_cost_upper_bound - partition_cost_lower_bound > 1) { |
| partition.reserve(max_num_partitions + 1); |
| const int partition_cost = |
| partition_cost_lower_bound + |
| (partition_cost_upper_bound - partition_cost_lower_bound) / 2; |
| bool admissible = MaxPartitionCostIsFeasible(start, |
| end, |
| max_num_partitions, |
| partition_cost, |
| cumulative_cost_offset, |
| cumulative_cost_data, |
| cumulative_cost_fun, |
| &partition); |
| if (admissible) { |
| partition_cost_upper_bound = partition_cost; |
| std::swap(partition, partition_upper_bound); |
| } else { |
| partition_cost_lower_bound = partition_cost; |
| } |
| } |
| |
| // After binary search over partition cost, interval |
| // (partition_cost_lower_bound, partition_cost_upper_bound] contains the only |
| // admissible partition cost value - partition_cost_upper_bound |
| // |
| // Partition for this cost value might have been already computed |
| if (partition_upper_bound.empty() == false) { |
| return partition_upper_bound; |
| } |
| // Partition for upper bound is not computed if and only if upper bound was |
| // never updated This is a simple case of a single interval containing all |
| // values, which we were not able to break into pieces |
| partition = {start, end}; |
| return partition; |
| } |
| } // namespace parallel_for_details |
| |
| // Forward declaration of parallel invocation function that is to be |
| // implemented by each threading backend |
| template <typename F> |
| void ParallelInvoke(ContextImpl* context, |
| int start, |
| int end, |
| int num_threads, |
| const F& function); |
| |
| // Execute the function for every element in the range [start, end) with at most |
| // num_threads. It will execute all the work on the calling thread if |
| // num_threads or (end - start) is equal to 1. |
| // |
| // Functions with two arguments will be passed thread_id and loop index on each |
| // invocation, functions with one argument will be invoked with loop index |
| template <typename F> |
| void ParallelFor(ContextImpl* context, |
| int start, |
| int end, |
| int num_threads, |
| const F& function) { |
| using namespace parallel_for_details; |
| CHECK_GT(num_threads, 0); |
| if (start >= end) { |
| return; |
| } |
| |
| if (num_threads == 1 || end - start == 1) { |
| InvokeOnSegment<F>(0, start, end, function); |
| return; |
| } |
| |
| CHECK(context != nullptr); |
| ParallelInvoke<F>(context, start, end, num_threads, function); |
| } |
| |
| // Execute function for every element in the range [start, end) with at most |
| // num_threads, taking into account user-provided integer cumulative costs of |
| // iterations. Cumulative costs of iteration for indices in range [0, end) are |
| // stored in objects from cumulative_cost_data. User-provided |
| // cumulative_cost_fun returns non-decreasing integer values corresponding to |
| // inclusive cumulative cost of loop iterations, provided with a reference to |
| // user-defined object. Only indices from [start, end) will be referenced. This |
| // routine assumes that cumulative_cost_fun is non-decreasing (in other words, |
| // all costs are non-negative); |
| template <typename F, typename CumulativeCostData, typename CumulativeCostFun> |
| void ParallelFor(ContextImpl* context, |
| int start, |
| int end, |
| int num_threads, |
| const F& function, |
| const CumulativeCostData* cumulative_cost_data, |
| const CumulativeCostFun& cumulative_cost_fun) { |
| using namespace parallel_for_details; |
| CHECK_GT(num_threads, 0); |
| if (start >= end) { |
| return; |
| } |
| if (num_threads == 1 || end - start <= num_threads) { |
| ParallelFor(context, start, end, num_threads, function); |
| return; |
| } |
| // Creating several partitions allows us to tolerate imperfections of |
| // partitioning and user-supplied iteration costs up to a certain extent |
| const int kNumPartitionsPerThread = 4; |
| const int kMaxPartitions = num_threads * kNumPartitionsPerThread; |
| const std::vector<int> partitions = ComputePartition( |
| start, end, kMaxPartitions, cumulative_cost_data, cumulative_cost_fun); |
| CHECK_GT(partitions.size(), 1); |
| const int num_partitions = partitions.size() - 1; |
| ParallelFor(context, |
| 0, |
| num_partitions, |
| num_threads, |
| [&function, &partitions](int thread_id, int partition_id) { |
| const int partition_start = partitions[partition_id]; |
| const int partition_end = partitions[partition_id + 1]; |
| |
| InvokeOnSegment<F>( |
| thread_id, partition_start, partition_end, function); |
| }); |
| } |
| |
| // Execute function for every element in the range [start, end) with at most |
| // num_threads, using the user provided partitioning. taking into account |
| // user-provided integer cumulative costs of iterations. |
| template <typename F> |
| void ParallelFor(ContextImpl* context, |
| int start, |
| int end, |
| int num_threads, |
| const F& function, |
| const std::vector<int>& partitions) { |
| using namespace parallel_for_details; |
| CHECK_GT(num_threads, 0); |
| if (start >= end) { |
| return; |
| } |
| CHECK_EQ(partitions.front(), start); |
| CHECK_EQ(partitions.back(), end); |
| if (num_threads == 1 || end - start <= num_threads) { |
| ParallelFor(context, start, end, num_threads, function); |
| return; |
| } |
| CHECK_GT(partitions.size(), 1); |
| const int num_partitions = partitions.size() - 1; |
| ParallelFor(context, |
| 0, |
| num_partitions, |
| num_threads, |
| [&function, &partitions](int thread_id, int partition_id) { |
| const int partition_start = partitions[partition_id]; |
| const int partition_end = partitions[partition_id + 1]; |
| |
| InvokeOnSegment<F>( |
| thread_id, partition_start, partition_end, function); |
| }); |
| } |
| |
| // Evaluate vector expression in parallel |
| // Assuming LhsExpression and RhsExpression are some sort of |
| // column-vector expression, assignment lhs = rhs |
| // is eavluated over a set of contiguous blocks in parallel. |
| // This is expected to work well in the case of vector-based |
| // expressions (since they typically do not result into |
| // temporaries). |
| // This method expects lhs to be size-compatible with rhs |
| template <typename LhsExpression, typename RhsExpression> |
| void ParallelAssign(ContextImpl* context, |
| int num_threads, |
| LhsExpression& lhs, |
| const RhsExpression& rhs) { |
| static_assert(LhsExpression::ColsAtCompileTime == 1); |
| static_assert(RhsExpression::ColsAtCompileTime == 1); |
| CHECK_EQ(lhs.rows(), rhs.rows()); |
| const int num_rows = lhs.rows(); |
| ParallelFor(context, |
| 0, |
| num_rows, |
| num_threads, |
| [&lhs, &rhs](const std::tuple<int, int>& range) { |
| auto [start, end] = range; |
| lhs.segment(start, end - start) = |
| rhs.segment(start, end - start); |
| }); |
| } |
| |
| // Set vector to zero using num_threads |
| template <typename VectorType> |
| void ParallelSetZero(ContextImpl* context, |
| int num_threads, |
| VectorType& vector) { |
| ParallelSetZero(context, num_threads, vector.data(), vector.rows()); |
| } |
| void ParallelSetZero(ContextImpl* context, |
| int num_threads, |
| double* values, |
| int num_values); |
| } // namespace ceres::internal |
| |
| // Backend-specific implementations of ParallelInvoke |
| #include "ceres/internal/disable_warnings.h" |
| #include "ceres/parallel_for_cxx.h" |
| |
| #endif // CERES_INTERNAL_PARALLEL_FOR_H_ |