| // Ceres Solver - A fast non-linear least squares minimizer | 
 | // Copyright 2023 Google Inc. All rights reserved. | 
 | // http://ceres-solver.org/ | 
 | // | 
 | // Redistribution and use in source and binary forms, with or without | 
 | // modification, are permitted provided that the following conditions are met: | 
 | // | 
 | // * Redistributions of source code must retain the above copyright notice, | 
 | //   this list of conditions and the following disclaimer. | 
 | // * Redistributions in binary form must reproduce the above copyright notice, | 
 | //   this list of conditions and the following disclaimer in the documentation | 
 | //   and/or other materials provided with the distribution. | 
 | // * Neither the name of Google Inc. nor the names of its contributors may be | 
 | //   used to endorse or promote products derived from this software without | 
 | //   specific prior written permission. | 
 | // | 
 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
 | // POSSIBILITY OF SUCH DAMAGE. | 
 | // | 
 | // Authors: vitus@google.com (Michael Vitus), | 
 | //          dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) | 
 |  | 
 | #ifndef CERES_INTERNAL_PARALLEL_FOR_H_ | 
 | #define CERES_INTERNAL_PARALLEL_FOR_H_ | 
 |  | 
 | #include <mutex> | 
 | #include <vector> | 
 |  | 
 | #include "absl/log/check.h" | 
 | #include "ceres/context_impl.h" | 
 | #include "ceres/internal/eigen.h" | 
 | #include "ceres/internal/export.h" | 
 | #include "ceres/parallel_invoke.h" | 
 | #include "ceres/partition_range_for_parallel_for.h" | 
 |  | 
 | namespace ceres::internal { | 
 |  | 
 | // Use a dummy mutex if num_threads = 1. | 
 | inline decltype(auto) MakeConditionalLock(const int num_threads, | 
 |                                           std::mutex& m) { | 
 |   return (num_threads == 1) ? std::unique_lock<std::mutex>{} | 
 |                             : std::unique_lock<std::mutex>{m}; | 
 | } | 
 |  | 
 | // Execute the function for every element in the range [start, end) with at most | 
 | // num_threads. It will execute all the work on the calling thread if | 
 | // num_threads or (end - start) is equal to 1. | 
 | // Depending on function signature, it will be supplied with either loop index | 
 | // or a range of loop indices; function can also be supplied with thread_id. | 
 | // The following function signatures are supported: | 
 | //  - Functions accepting a single loop index: | 
 | //     - [](int index) { ... } | 
 | //     - [](int thread_id, int index) { ... } | 
 | //  - Functions accepting a range of loop index: | 
 | //     - [](std::tuple<int, int> index) { ... } | 
 | //     - [](int thread_id, std::tuple<int, int> index) { ... } | 
 | // | 
 | // When distributing workload between threads, it is assumed that each loop | 
 | // iteration takes approximately equal time to complete. | 
 | template <typename F> | 
 | void ParallelFor(ContextImpl* context, | 
 |                  int start, | 
 |                  int end, | 
 |                  int num_threads, | 
 |                  F&& function, | 
 |                  int min_block_size = 1) { | 
 |   CHECK_GT(num_threads, 0); | 
 |   if (start >= end) { | 
 |     return; | 
 |   } | 
 |  | 
 |   if (num_threads == 1 || end - start < min_block_size * 2) { | 
 |     InvokeOnSegment(0, std::make_tuple(start, end), std::forward<F>(function)); | 
 |     return; | 
 |   } | 
 |  | 
 |   CHECK(context != nullptr); | 
 |   ParallelInvoke(context, | 
 |                  start, | 
 |                  end, | 
 |                  num_threads, | 
 |                  std::forward<F>(function), | 
 |                  min_block_size); | 
 | } | 
 |  | 
// Execute function for every element in the range [start, end) with at most
// num_threads, using user-provided partitions array.
// When distributing workload between threads, it is assumed that each segment
// bounded by adjacent elements of partitions array takes approximately equal
// time to process.
template <typename F>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 F&& function,
                 const std::vector<int>& partitions) {
  CHECK_GT(num_threads, 0);
  // Empty range: nothing to do. Returning here also avoids touching
  // partitions.front()/.back() below for a range with no iterations.
  if (start >= end) {
    return;
  }
  // The partitions array must cover exactly [start, end): its first element
  // is the first loop index and its last element is the past-the-end index.
  CHECK_EQ(partitions.front(), start);
  CHECK_EQ(partitions.back(), end);
  // With a single thread, or with no more iterations than threads, the
  // user-provided partitioning cannot help; fall back to the uniform-cost
  // overload over the original index range.
  if (num_threads == 1 || end - start <= num_threads) {
    ParallelFor(context, start, end, num_threads, std::forward<F>(function));
    return;
  }
  CHECK_GT(partitions.size(), 1);
  const int num_partitions = partitions.size() - 1;
  // Parallelize over partition indices instead of loop indices; each worker
  // maps its contiguous run of partitions back to the corresponding range of
  // original loop indices. Note that `function` is captured by reference
  // (not forwarded) because the lambda may be invoked multiple times.
  ParallelFor(context,
              0,
              num_partitions,
              num_threads,
              [&function, &partitions](int thread_id,
                                       std::tuple<int, int> partition_ids) {
                // partition_ids is a range of partition indices
                const auto [partition_start, partition_end] = partition_ids;
                // Execution over several adjacent segments is equivalent
                // to execution over union of those segments (which is also a
                // contiguous segment)
                const int range_start = partitions[partition_start];
                const int range_end = partitions[partition_end];
                // Range of original loop indices
                const auto range = std::make_tuple(range_start, range_end);
                InvokeOnSegment(thread_id, range, function);
              });
}
 |  | 
// Execute function for every element in the range [start, end) with at most
// num_threads, taking into account user-provided integer cumulative costs of
// iterations. Cumulative costs of iteration for indices in range [0, end) are
// stored in objects from cumulative_cost_data. User-provided
// cumulative_cost_fun returns non-decreasing integer values corresponding to
// inclusive cumulative cost of loop iterations, provided with a reference to
// user-defined object. Only indices from [start, end) will be referenced. This
// routine assumes that cumulative_cost_fun is non-decreasing (in other words,
// all costs are non-negative).
// When distributing workload between threads, input range of loop indices will
// be partitioned into disjoint contiguous intervals, with the maximal cost
// being minimized.
// For example, with iteration costs of [1, 1, 5, 3, 1, 4] cumulative_cost_fun
// should return [1, 2, 7, 10, 11, 15], and with num_threads = 4 this range
// will be split into segments [0, 2) [2, 3) [3, 5) [5, 6) with costs
// [2, 5, 4, 4].
template <typename F, typename CumulativeCostData, typename CumulativeCostFun>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 F&& function,
                 const CumulativeCostData* cumulative_cost_data,
                 CumulativeCostFun&& cumulative_cost_fun) {
  CHECK_GT(num_threads, 0);
  // Empty range: nothing to do.
  if (start >= end) {
    return;
  }
  // With a single thread, or with no more iterations than threads, cost-based
  // partitioning cannot help; fall back to the uniform-cost overload.
  if (num_threads == 1 || end - start <= num_threads) {
    ParallelFor(context, start, end, num_threads, std::forward<F>(function));
    return;
  }
  // Creating several partitions allows us to tolerate imperfections of
  // partitioning and user-supplied iteration costs up to a certain extent
  constexpr int kNumPartitionsPerThread = 4;
  const int kMaxPartitions = num_threads * kNumPartitionsPerThread;
  // Binding the returned temporary to a const reference extends its lifetime
  // to the end of this scope.
  const auto& partitions = PartitionRangeForParallelFor(
      start,
      end,
      kMaxPartitions,
      cumulative_cost_data,
      std::forward<CumulativeCostFun>(cumulative_cost_fun));
  CHECK_GT(partitions.size(), 1);
  // Delegate to the overload that parallelizes over an explicit partitioning.
  ParallelFor(
      context, start, end, num_threads, std::forward<F>(function), partitions);
}
 | }  // namespace ceres::internal | 
 |  | 
 | #endif  // CERES_INTERNAL_PARALLEL_FOR_H_ |