// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: vitus@google.com (Michael Vitus),
//          dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#ifndef CERES_INTERNAL_PARALLEL_FOR_H_
#define CERES_INTERNAL_PARALLEL_FOR_H_

#include <mutex>
#include <tuple>
#include <vector>

#include "ceres/context_impl.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/parallel_invoke.h"
#include "ceres/partition_range_for_parallel_for.h"
#include "glog/logging.h"

namespace ceres::internal {
// Acquire the mutex only when more than one thread is used; with
// num_threads == 1 there is no concurrent access, so an empty (unlocked)
// std::unique_lock is returned instead.
inline decltype(auto) MakeConditionalLock(const int num_threads,
                                          std::mutex& m) {
  return (num_threads == 1) ? std::unique_lock<std::mutex>{}
                            : std::unique_lock<std::mutex>{m};
}
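
// Illustrative sketch of the intended use (the names `m`, `sum`, and
// `partial` here are hypothetical, not part of the Ceres API): inside a
// per-thread loop body, lock shared state only when more than one thread
// may actually touch it.
//
//   std::mutex m;
//   double sum = 0.0;
//   // ... inside a loop body executed with num_threads threads:
//   auto lock = MakeConditionalLock(num_threads, m);
//   sum += partial;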

// Execute the function for every element in the range [start, end) with at
// most num_threads. All work is executed on the calling thread if num_threads
// or (end - start) is equal to 1.
// Depending on its signature, the function is supplied with either a single
// loop index or a range of loop indices; it can also be supplied with a
// thread_id. The following function signatures are supported:
//  - Functions accepting a single loop index:
//     - [](int index) { ... }
//     - [](int thread_id, int index) { ... }
//  - Functions accepting a range of loop indices:
//     - [](std::tuple<int, int> index) { ... }
//     - [](int thread_id, std::tuple<int, int> index) { ... }
//
// When distributing workload between threads, it is assumed that each loop
// iteration takes approximately equal time to complete.
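//
// For example (an illustrative sketch, assuming `context` points to a valid
// ContextImpl with its thread pool initialized):
//
//   std::vector<double> x(100, 1.0);
//   ParallelFor(context, 0, static_cast<int>(x.size()), /*num_threads=*/4,
//               [&x](int i) { x[i] *= 2.0; });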
template <typename F>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 F&& function,
                 int min_block_size = 1) {
  CHECK_GT(num_threads, 0);
  if (start >= end) {
    return;
  }

  if (num_threads == 1 || end - start < min_block_size * 2) {
    InvokeOnSegment(0, std::make_tuple(start, end), std::forward<F>(function));
    return;
  }

  CHECK(context != nullptr);
  ParallelInvoke(context,
                 start,
                 end,
                 num_threads,
                 std::forward<F>(function),
                 min_block_size);
}

// Execute the function for every element in the range [start, end) with at
// most num_threads, using a user-provided array of partition points. When
// distributing workload between threads, it is assumed that each segment
// bounded by adjacent elements of the partitions array takes approximately
// equal time to process.
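//
// For example (an illustrative sketch; `context` is assumed to be a valid
// ContextImpl): with partition points {0, 4, 6, 9}, the iterations are
// grouped into the segments [0, 4), [4, 6), [6, 9), each assumed to take
// roughly equal time to process.
//
//   const std::vector<int> partitions = {0, 4, 6, 9};
//   ParallelFor(context, 0, 9, /*num_threads=*/2,
//               [](int i) { /* process element i */ }, partitions);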
template <typename F>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 F&& function,
                 const std::vector<int>& partitions) {
  CHECK_GT(num_threads, 0);
  if (start >= end) {
    return;
  }
  CHECK_EQ(partitions.front(), start);
  CHECK_EQ(partitions.back(), end);
  if (num_threads == 1 || end - start <= num_threads) {
    ParallelFor(context, start, end, num_threads, std::forward<F>(function));
    return;
  }
  CHECK_GT(partitions.size(), 1);
  const int num_partitions = partitions.size() - 1;
  ParallelFor(context,
              0,
              num_partitions,
              num_threads,
              [&function, &partitions](int thread_id,
                                       std::tuple<int, int> partition_ids) {
                // partition_ids is a range of partition indices.
                const auto [partition_start, partition_end] = partition_ids;
                // Execution over several adjacent segments is equivalent to
                // execution over the union of those segments (which is also
                // a contiguous segment).
                const int range_start = partitions[partition_start];
                const int range_end = partitions[partition_end];
                // Range of original loop indices.
                const auto range = std::make_tuple(range_start, range_end);
                InvokeOnSegment(thread_id, range, function);
              });
}

// Execute the function for every element in the range [start, end) with at
// most num_threads, taking into account user-provided integer cumulative
// costs of iterations. Cumulative costs of iterations for indices in the
// range [0, end) are stored in objects from cumulative_cost_data. The
// user-provided cumulative_cost_fun, given a reference to such an object,
// returns a non-decreasing integer value: the inclusive cumulative cost of
// the loop iterations up to that index. Only indices from [start, end) will
// be referenced. This routine assumes that cumulative_cost_fun is
// non-decreasing (in other words, that all costs are non-negative).
// When distributing workload between threads, the input range of loop indices
// is partitioned into disjoint contiguous intervals such that the maximal
// per-interval cost is minimized.
// For example, with iteration costs of [1, 1, 5, 3, 1, 4], cumulative_cost_fun
// should return [1, 2, 7, 10, 11, 15], and with num_threads = 4 this range
// will be split into segments [0, 2) [2, 3) [3, 5) [5, 6) with costs
// [2, 5, 4, 4].
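//
// An illustrative sketch of the example above (hypothetical data; real call
// sites typically pass structures such as sparse-matrix rows): storing the
// cumulative costs directly as ints makes cumulative_cost_fun the identity
// on its argument.
//
//   const std::vector<int> cumulative_costs = {1, 2, 7, 10, 11, 15};
//   ParallelFor(context, 0, 6, /*num_threads=*/4,
//               [](int i) { /* process element i */ },
//               cumulative_costs.data(),
//               [](const int& cumulative_cost) { return cumulative_cost; });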
template <typename F, typename CumulativeCostData, typename CumulativeCostFun>
void ParallelFor(ContextImpl* context,
                 int start,
                 int end,
                 int num_threads,
                 F&& function,
                 const CumulativeCostData* cumulative_cost_data,
                 CumulativeCostFun&& cumulative_cost_fun) {
  CHECK_GT(num_threads, 0);
  if (start >= end) {
    return;
  }
  if (num_threads == 1 || end - start <= num_threads) {
    ParallelFor(context, start, end, num_threads, std::forward<F>(function));
    return;
  }
  // Creating several partitions per thread allows us to tolerate, to some
  // extent, imperfections in the partitioning and in the user-supplied
  // iteration costs.
  constexpr int kNumPartitionsPerThread = 4;
  const int kMaxPartitions = num_threads * kNumPartitionsPerThread;
  const auto& partitions = PartitionRangeForParallelFor(
      start,
      end,
      kMaxPartitions,
      cumulative_cost_data,
      std::forward<CumulativeCostFun>(cumulative_cost_fun));
  CHECK_GT(partitions.size(), 1);
  ParallelFor(
      context, start, end, num_threads, std::forward<F>(function), partitions);
}
}  // namespace ceres::internal

#endif  // CERES_INTERNAL_PARALLEL_FOR_H_