// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2018 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: vitus@google.com (Michael Vitus),
//          dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#ifndef CERES_INTERNAL_PARALLEL_FOR_H_
#define CERES_INTERNAL_PARALLEL_FOR_H_

#include <algorithm>
#include <functional>
#include <mutex>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "glog/logging.h"

namespace ceres::internal {

// Lock the given mutex only if num_threads > 1; with a single thread an
// unlocked (empty) std::unique_lock is returned, so no locking overhead is
// incurred.
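//
// Example usage (illustrative sketch): guard a shared accumulator only when
// the loop actually runs multi-threaded. Evaluate() is a hypothetical
// per-iteration computation; context, n and num_threads are assumed to be
// supplied by the caller.
//
//   std::mutex mutex;
//   double sum = 0.;
//   ParallelFor(context, 0, n, num_threads, [&](int i) {
//     const double value = Evaluate(i);
//     auto lock = MakeConditionalLock(num_threads, mutex);
//     sum += value;
//   });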
inline decltype(auto) MakeConditionalLock(const int num_threads,
                                          std::mutex& m) {
  return (num_threads == 1) ? std::unique_lock<std::mutex>{}
                            : std::unique_lock<std::mutex>{m};
}

// Returns the maximum number of threads supported by the threading backend
// Ceres was compiled with.
CERES_NO_EXPORT
int MaxNumThreadsAvailable();

// Parallel for implementations share a common set of routines in order
// to enforce inlining of loop bodies, ensuring that single-threaded
// performance is equivalent to a simple for loop.
namespace parallel_for_details {
// Get the argument types of a callable as a tuple of decayed types.
template <typename F, typename... Args>
std::tuple<std::decay_t<Args>...> args_of(void (F::*)(Args...) const);

template <typename F>
using args_of_t = decltype(args_of(&F::operator()));

// Parallelizable functions might require passing thread_id as the first
// argument. This class supplies the thread_id argument to functions that
// accept it and ignores it otherwise.
template <typename F, typename Args>
struct InvokeImpl;

// For parallel for iterations of type [](int i) -> void
template <typename F>
struct InvokeImpl<F, std::tuple<int>> {
  static void InvokeOnSegment(int thread_id,
                              int start,
                              int end,
                              const F& function) {
    (void)thread_id;
    for (int i = start; i < end; ++i) {
      function(i);
    }
  }
};

// For parallel for iterations of type [](int thread_id, int i) -> void
template <typename F>
struct InvokeImpl<F, std::tuple<int, int>> {
  static void InvokeOnSegment(int thread_id,
                              int start,
                              int end,
                              const F& function) {
    for (int i = start; i < end; ++i) {
      function(thread_id, i);
    }
  }
};

// For parallel for iterations of type [](tuple<int, int> range) -> void
template <typename F>
struct InvokeImpl<F, std::tuple<std::tuple<int, int>>> {
  static void InvokeOnSegment(int thread_id,
                              int start,
                              int end,
                              const F& function) {
    (void)thread_id;
    function(std::make_tuple(start, end));
  }
};

// For parallel for iterations of type [](int thread_id, tuple<int, int> range)
// -> void
template <typename F>
struct InvokeImpl<F, std::tuple<int, std::tuple<int, int>>> {
  static void InvokeOnSegment(int thread_id,
                              int start,
                              int end,
                              const F& function) {
    function(thread_id, std::make_tuple(start, end));
  }
};

// Invoke function passing thread_id only if required
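//
// For illustration, each of the following callables is accepted and dispatched
// to the matching InvokeImpl specialization above:
//
//   [](int i) { ... }                                    // loop index only
//   [](int thread_id, int i) { ... }                     // thread id + index
//   [](std::tuple<int, int> range) { ... }               // whole [start, end)
//   [](int thread_id, std::tuple<int, int> range) { ... }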
template <typename F>
void InvokeOnSegment(int thread_id, int start, int end, const F& function) {
  InvokeImpl<F, args_of_t<F>>::InvokeOnSegment(thread_id, start, end, function);
}

// Check if it is possible to split the range [start, end) into at most
// max_num_partitions contiguous partitions, each of cost not greater than
// max_partition_cost. Inclusive integer cumulative costs are provided by the
// cumulative_cost_data objects, with cumulative_cost_offset being the total
// cost of all indices (starting from zero) preceding the start element.
// Cumulative costs are returned by cumulative_cost_fun called with a reference
// to a cumulative_cost_data element with index from the range [start, end),
// and should be non-decreasing. The partition of the range is returned via the
// partition argument.
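//
// Worked example (illustrative): for per-element costs {1, 3, 2, 2} the
// inclusive cumulative costs are {1, 4, 6, 8}. With start = 0, end = 4,
// max_num_partitions = 2 and cumulative_cost_offset = 0, a max_partition_cost
// of 4 is feasible and yields the partition {0, 2, 4} (segments [0, 2) and
// [2, 4), each of cost 4), while a max_partition_cost of 3 is not feasible.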
template <typename CumulativeCostData, typename CumulativeCostFun>
bool MaxPartitionCostIsFeasible(int start,
                                int end,
                                int max_num_partitions,
                                int max_partition_cost,
                                int cumulative_cost_offset,
                                const CumulativeCostData* cumulative_cost_data,
                                const CumulativeCostFun& cumulative_cost_fun,
                                std::vector<int>* partition) {
  partition->clear();
  partition->push_back(start);
  int partition_start = start;
  int cost_offset = cumulative_cost_offset;

  while (partition_start < end) {
    // Already have max_num_partitions partitions, but the range is not yet
    // fully covered
    if (partition->size() > static_cast<size_t>(max_num_partitions)) {
      return false;
    }
    const int target = max_partition_cost + cost_offset;
    const int partition_end =
        std::partition_point(
            cumulative_cost_data + partition_start,
            cumulative_cost_data + end,
            [&cumulative_cost_fun, target](const CumulativeCostData& item) {
              return cumulative_cost_fun(item) <= target;
            }) -
        cumulative_cost_data;
    // Even a single element does not fit into a partition of the given cost
    if (partition_end == partition_start) {
      return false;
    }

    const int cost_last =
        cumulative_cost_fun(cumulative_cost_data[partition_end - 1]);
    partition->push_back(partition_end);
    partition_start = partition_end;
    cost_offset = cost_last;
  }
  return true;
}

// Split the integer interval [start, end) into at most max_num_partitions
// contiguous intervals, minimizing the maximal total cost of a single
// interval. Inclusive integer cumulative costs for each (zero-based) index are
// provided by the cumulative_cost_data objects, and are returned by calling
// cumulative_cost_fun with a reference to one of the objects from the range
// [start, end).
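//
// Worked example (illustrative), continuing the one above: for per-element
// costs {1, 3, 2, 2} (cumulative costs {1, 4, 6, 8}) and max_num_partitions =
// 2, the binary search over the cost interval (3, 8] settles on 4 and the
// returned partition is {0, 2, 4}, i.e. segments [0, 2) and [2, 4) of cost 4
// each.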
template <typename CumulativeCostData, typename CumulativeCostFun>
std::vector<int> ComputePartition(
    int start,
    int end,
    int max_num_partitions,
    const CumulativeCostData* cumulative_cost_data,
    const CumulativeCostFun& cumulative_cost_fun) {
  // Given a maximal partition cost, MaxPartitionCostIsFeasible checks whether
  // it is admissible and, if so, computes the corresponding partition. In
  // order to find the lowest admissible value, a binary search over all
  // potentially optimal cost values is performed.
  const int cumulative_cost_last =
      cumulative_cost_fun(cumulative_cost_data[end - 1]);
  const int cumulative_cost_offset =
      start ? cumulative_cost_fun(cumulative_cost_data[start - 1]) : 0;
  const int total_cost = cumulative_cost_last - cumulative_cost_offset;

  // The minimal maximal partition cost is not smaller than the average cost
  // per partition; the lower bound is exclusive.
  int partition_cost_lower_bound = total_cost / max_num_partitions - 1;
  // The minimal maximal partition cost is not larger than the total cost;
  // the upper bound is inclusive.
  int partition_cost_upper_bound = total_cost;
  std::vector<int> partition, partition_upper_bound;
  // Binary search over partition cost, returning the lowest admissible cost
  while (partition_cost_upper_bound - partition_cost_lower_bound > 1) {
    partition.reserve(max_num_partitions + 1);
    const int partition_cost =
        partition_cost_lower_bound +
        (partition_cost_upper_bound - partition_cost_lower_bound) / 2;
    bool admissible = MaxPartitionCostIsFeasible(start,
                                                 end,
                                                 max_num_partitions,
                                                 partition_cost,
                                                 cumulative_cost_offset,
                                                 cumulative_cost_data,
                                                 cumulative_cost_fun,
                                                 &partition);
    if (admissible) {
      partition_cost_upper_bound = partition_cost;
      std::swap(partition, partition_upper_bound);
    } else {
      partition_cost_lower_bound = partition_cost;
    }
  }

  // After the binary search over partition cost, the interval
  // (partition_cost_lower_bound, partition_cost_upper_bound] contains the only
  // admissible partition cost value: partition_cost_upper_bound.
  //
  // The partition for this cost value might have been computed already.
  if (!partition_upper_bound.empty()) {
    return partition_upper_bound;
  }
  // The partition for the upper bound is not computed if and only if the upper
  // bound was never updated. This is the case of a single interval containing
  // all values, which we were not able to break into pieces.
  partition = {start, end};
  return partition;
}
}  // namespace parallel_for_details

// Forward declaration of parallel invocation function that is to be
// implemented by each threading backend
template <typename F>
void ParallelInvoke(ContextImpl* context,
                    int start,
                    int end,
                    int num_threads,
                    const F& function);

// Execute the function for every element in the range [start, end) with at
// most num_threads threads. All the work is executed on the calling thread if
// num_threads or (end - start) is equal to 1.
//
// Functions with two arguments are passed the thread_id and the loop index on
// each invocation; functions with a single argument are passed only the loop
// index.
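//
// Example usage (illustrative sketch; context and num_threads are assumed to
// be provided by the caller):
//
//   std::vector<double> x(1000, 1.0);
//   // Single-argument form: only the loop index is passed.
//   ParallelFor(context, 0, static_cast<int>(x.size()), num_threads,
//               [&x](int i) { x[i] *= 2.0; });
//
//   // Two-argument form: thread_id enables per-thread scratch storage.
//   std::vector<double> sums(num_threads, 0.0);
//   ParallelFor(context, 0, static_cast<int>(x.size()), num_threads,
//               [&](int thread_id, int i) { sums[thread_id] += x[i]; });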
template <typename F>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 const F& function) {
  using namespace parallel_for_details;
  CHECK_GT(num_threads, 0);
  if (start >= end) {
    return;
  }

  if (num_threads == 1 || end - start == 1) {
    InvokeOnSegment<F>(0, start, end, function);
    return;
  }

  CHECK(context != nullptr);
  ParallelInvoke<F>(context, start, end, num_threads, function);
}

// Execute the function for every element in the range [start, end) with at
// most num_threads threads, taking into account user-provided integer
// cumulative costs of iterations. Cumulative costs of iterations for indices
// in the range [0, end) are stored in objects from cumulative_cost_data. The
// user-provided cumulative_cost_fun returns the non-decreasing integer value
// corresponding to the inclusive cumulative cost of loop iterations, given a
// reference to a user-defined object. Only indices from [start, end) will be
// referenced. This routine assumes that cumulative_cost_fun is non-decreasing
// (in other words, all costs are non-negative).
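//
// Example (illustrative sketch): iterating over the rows of a CRS-style sparse
// matrix where the work per row is proportional to its number of non-zeros.
// Here rows is assumed to be a std::vector<int> of row offsets with
// num_rows + 1 entries (rows[0] == 0), so rows[i + 1] is the inclusive
// cumulative cost of rows 0, ..., i:
//
//   ParallelFor(context, 0, num_rows, num_threads,
//               [&](int row) { /* process row */ },
//               rows.data() + 1,
//               [](const int& cumulative_nnz) { return cumulative_nnz; });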
template <typename F, typename CumulativeCostData, typename CumulativeCostFun>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 const F& function,
                 const CumulativeCostData* cumulative_cost_data,
                 const CumulativeCostFun& cumulative_cost_fun) {
  using namespace parallel_for_details;
  CHECK_GT(num_threads, 0);
  if (start >= end) {
    return;
  }
  if (num_threads == 1 || end - start <= num_threads) {
    ParallelFor(context, start, end, num_threads, function);
    return;
  }
  // Creating several partitions per thread allows us to tolerate, to some
  // extent, imperfections of the partitioning and of the user-supplied
  // iteration costs.
  const int kNumPartitionsPerThread = 4;
  const int kMaxPartitions = num_threads * kNumPartitionsPerThread;
  const std::vector<int> partitions = ComputePartition(
      start, end, kMaxPartitions, cumulative_cost_data, cumulative_cost_fun);
  CHECK_GT(partitions.size(), 1);
  const int num_partitions = partitions.size() - 1;
  ParallelFor(context,
              0,
              num_partitions,
              num_threads,
              [&function, &partitions](int thread_id, int partition_id) {
                const int partition_start = partitions[partition_id];
                const int partition_end = partitions[partition_id + 1];

                InvokeOnSegment<F>(
                    thread_id, partition_start, partition_end, function);
              });
}

// Execute the function for every element in the range [start, end) with at
// most num_threads threads, using the user-provided partitioning of the range.
// The first and last elements of partitions must be start and end,
// respectively.
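//
// Example (illustrative sketch): reusing a fixed partition across several
// loops over the same range, so that the cost of computing the partition is
// paid only once. The partition below is assumed to have been computed
// elsewhere:
//
//   const std::vector<int> partitions = {0, 400, 700, 1000};
//   ParallelFor(context, 0, 1000, num_threads,
//               [&](int i) { /* iteration i */ },
//               partitions);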
template <typename F>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 const F& function,
                 const std::vector<int>& partitions) {
  using namespace parallel_for_details;
  CHECK_GT(num_threads, 0);
  if (start >= end) {
    return;
  }
  CHECK_EQ(partitions.front(), start);
  CHECK_EQ(partitions.back(), end);
  if (num_threads == 1 || end - start <= num_threads) {
    ParallelFor(context, start, end, num_threads, function);
    return;
  }
  CHECK_GT(partitions.size(), 1);
  const int num_partitions = partitions.size() - 1;
  ParallelFor(context,
              0,
              num_partitions,
              num_threads,
              [&function, &partitions](int thread_id, int partition_id) {
                const int partition_start = partitions[partition_id];
                const int partition_end = partitions[partition_id + 1];

                InvokeOnSegment<F>(
                    thread_id, partition_start, partition_end, function);
              });
}

// Evaluate a vector expression in parallel.
// Assuming LhsExpression and RhsExpression are some sort of column-vector
// expression, the assignment lhs = rhs is evaluated over a set of contiguous
// blocks in parallel. This is expected to work well in the case of
// vector-based expressions (since they typically do not result in
// temporaries). This method expects lhs to be size-compatible with rhs.
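//
// Example (illustrative sketch; Vector is the dynamically-sized column-vector
// typedef from ceres/internal/eigen.h, and context, n and num_threads come
// from the caller):
//
//   Vector x(n), y(n);
//   y.setRandom();
//   ParallelAssign(context, num_threads, x, 2.0 * y);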
template <typename LhsExpression, typename RhsExpression>
void ParallelAssign(ContextImpl* context,
                    int num_threads,
                    LhsExpression& lhs,
                    const RhsExpression& rhs) {
  static_assert(LhsExpression::ColsAtCompileTime == 1);
  static_assert(RhsExpression::ColsAtCompileTime == 1);
  CHECK_EQ(lhs.rows(), rhs.rows());
  const int num_rows = lhs.rows();
  ParallelFor(context,
              0,
              num_rows,
              num_threads,
              [&lhs, &rhs](const std::tuple<int, int>& range) {
                auto [start, end] = range;
                lhs.segment(start, end - start) =
                    rhs.segment(start, end - start);
              });
}

// Set the vector to zero using at most num_threads threads.
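//
// Example (illustrative sketch): ParallelSetZero(context, num_threads, x),
// where x is a Vector and context/num_threads come from the caller.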
template <typename VectorType>
void ParallelSetZero(ContextImpl* context,
                     int num_threads,
                     VectorType& vector) {
  ParallelSetZero(context, num_threads, vector.data(), vector.rows());
}
void ParallelSetZero(ContextImpl* context,
                     int num_threads,
                     double* values,
                     int num_values);
}  // namespace ceres::internal

// Backend-specific implementations of ParallelInvoke
#include "ceres/internal/disable_warnings.h"
#include "ceres/parallel_for_cxx.h"

#endif  // CERES_INTERNAL_PARALLEL_FOR_H_