internal/ceres/parallel_for.h - ceres-solver - Git at Google

 // Ceres Solver - A fast non-linear least squares minimizer
 // Copyright 2023 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // * Redistributions of source code must retain the above copyright notice,
 //   this list of conditions and the following disclaimer.
 // * Redistributions in binary form must reproduce the above copyright notice,
 //   this list of conditions and the following disclaimer in the documentation
 //   and/or other materials provided with the distribution.
 // * Neither the name of Google Inc. nor the names of its contributors may be
 //   used to endorse or promote products derived from this software without
 //   specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Authors: vitus@google.com (Michael Vitus),
 //          dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

 #ifndef CERES_INTERNAL_PARALLEL_FOR_H_
 #define CERES_INTERNAL_PARALLEL_FOR_H_

 #include <mutex>
 #include <vector>

 #include "absl/log/check.h"
 #include "ceres/context_impl.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/internal/export.h"
 #include "ceres/parallel_invoke.h"
 #include "ceres/partition_range_for_parallel_for.h"

 namespace ceres::internal {

 // Use a dummy mutex if num_threads = 1.
 inline decltype(auto) MakeConditionalLock(const int num_threads,
                                           std::mutex& m) {
   return (num_threads == 1) ? std::unique_lock<std::mutex>{}
                             : std::unique_lock<std::mutex>{m};
 }

 // Execute the function for every element in the range [start, end) with at most
 // num_threads. It will execute all the work on the calling thread if
 // num_threads or (end - start) is equal to 1.
 // Depending on function signature, it will be supplied with either loop index
 // or a range of loop indices; function can also be supplied with thread_id.
 // The following function signatures are supported:
 //  - Functions accepting a single loop index:
 //     - [](int index) { ... }
 //     - [](int thread_id, int index) { ... }
 //  - Functions accepting a range of loop index:
 //     - [](std::tuple<int, int> index) { ... }
 //     - [](int thread_id, std::tuple<int, int> index) { ... }
 //
 // When distributing workload between threads, it is assumed that each loop
 // iteration takes approximately equal time to complete.
 template <typename F>
 void ParallelFor(ContextImpl* context,
                  int start,
                  int end,
                  int num_threads,
                  F&& function,
                  int min_block_size = 1) {
   CHECK_GT(num_threads, 0);
   if (start >= end) {
     return;
   }

   if (num_threads == 1 || end - start < min_block_size * 2) {
     InvokeOnSegment(0, std::make_tuple(start, end), std::forward<F>(function));
     return;
   }

   CHECK(context != nullptr);
   ParallelInvoke(context,
                  start,
                  end,
                  num_threads,
                  std::forward<F>(function),
                  min_block_size);
 }

 // Execute function for every element in the range [start, end) with at most
 // num_threads, using user-provided partitions array.
 // When distributing workload between threads, it is assumed that each segment
 // bounded by adjacent elements of partitions array takes approximately equal
 // time to process.
 template <typename F>
 void ParallelFor(ContextImpl* context,
                  int start,
                  int end,
                  int num_threads,
                  F&& function,
                  const std::vector<int>& partitions) {
   CHECK_GT(num_threads, 0);
   if (start >= end) {
     return;
   }
   CHECK_EQ(partitions.front(), start);
   CHECK_EQ(partitions.back(), end);
   if (num_threads == 1 || end - start <= num_threads) {
     ParallelFor(context, start, end, num_threads, std::forward<F>(function));
     return;
   }
   CHECK_GT(partitions.size(), 1);
   const int num_partitions = partitions.size() - 1;
   ParallelFor(context,
               0,
               num_partitions,
               num_threads,
               [&function, &partitions](int thread_id,
                                        std::tuple<int, int> partition_ids) {
                 // partition_ids is a range of partition indices
                 const auto [partition_start, partition_end] = partition_ids;
                 // Execution over several adjacent segments is equivalent
                 // to execution over union of those segments (which is also a
                 // contiguous segment)
                 const int range_start = partitions[partition_start];
                 const int range_end = partitions[partition_end];
                 // Range of original loop indices
                 const auto range = std::make_tuple(range_start, range_end);
                 InvokeOnSegment(thread_id, range, function);
               });
 }

 // Execute function for every element in the range [start, end) with at most
 // num_threads, taking into account user-provided integer cumulative costs of
 // iterations. Cumulative costs of iteration for indices in range [0, end) are
 // stored in objects from cumulative_cost_data. User-provided
 // cumulative_cost_fun returns non-decreasing integer values corresponding to
 // inclusive cumulative cost of loop iterations, provided with a reference to
 // user-defined object. Only indices from [start, end) will be referenced. This
 // routine assumes that cumulative_cost_fun is non-decreasing (in other words,
 // all costs are non-negative);
 // When distributing workload between threads, input range of loop indices will
 // be partitioned into disjoint contiguous intervals, with the maximal cost
 // being minimized.
 // For example, with iteration costs of [1, 1, 5, 3, 1, 4] cumulative_cost_fun
 // should return [1, 2, 7, 10, 11, 15], and with num_threads = 4 this range
 // will be split into segments [0, 2) [2, 3) [3, 5) [5, 6) with costs
 // [2, 5, 4, 4].
 template <typename F, typename CumulativeCostData, typename CumulativeCostFun>
 void ParallelFor(ContextImpl* context,
                  int start,
                  int end,
                  int num_threads,
                  F&& function,
                  const CumulativeCostData* cumulative_cost_data,
                  CumulativeCostFun&& cumulative_cost_fun) {
   CHECK_GT(num_threads, 0);
   if (start >= end) {
     return;
   }
   if (num_threads == 1 || end - start <= num_threads) {
     ParallelFor(context, start, end, num_threads, std::forward<F>(function));
     return;
   }
   // Creating several partitions allows us to tolerate imperfections of
   // partitioning and user-supplied iteration costs up to a certain extent
   constexpr int kNumPartitionsPerThread = 4;
   const int kMaxPartitions = num_threads * kNumPartitionsPerThread;
   const auto& partitions = PartitionRangeForParallelFor(
       start,
       end,
       kMaxPartitions,
       cumulative_cost_data,
       std::forward<CumulativeCostFun>(cumulative_cost_fun));
   CHECK_GT(partitions.size(), 1);
   ParallelFor(
       context, start, end, num_threads, std::forward<F>(function), partitions);
 }
 }  // namespace ceres::internal

 #endif  // CERES_INTERNAL_PARALLEL_FOR_H_
	// Ceres Solver - A fast non-linear least squares minimizer
	// Copyright 2023 Google Inc. All rights reserved.
	// http://ceres-solver.org/
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright notice,
	// this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above copyright notice,
	// this list of conditions and the following disclaimer in the documentation
	// and/or other materials provided with the distribution.
	// * Neither the name of Google Inc. nor the names of its contributors may be
	// used to endorse or promote products derived from this software without
	// specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.
	//
	// Authors: vitus@google.com (Michael Vitus),
	// dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

	#ifndef CERES_INTERNAL_PARALLEL_FOR_H_
	#define CERES_INTERNAL_PARALLEL_FOR_H_

	#include <mutex>
	#include <vector>

	#include "absl/log/check.h"
	#include "ceres/context_impl.h"
	#include "ceres/internal/eigen.h"
	#include "ceres/internal/export.h"
	#include "ceres/parallel_invoke.h"
	#include "ceres/partition_range_for_parallel_for.h"

	namespace ceres::internal {

	// Use a dummy mutex if num_threads = 1.
	inline decltype(auto) MakeConditionalLock(const int num_threads,
	std::mutex& m) {
	return (num_threads == 1) ? std::unique_lock<std::mutex>{}
	: std::unique_lock<std::mutex>{m};
	}

	// Execute the function for every element in the range [start, end) with at most
	// num_threads. It will execute all the work on the calling thread if
	// num_threads or (end - start) is equal to 1.
	// Depending on function signature, it will be supplied with either loop index
	// or a range of loop indices; function can also be supplied with thread_id.
	// The following function signatures are supported:
	// - Functions accepting a single loop index:
	// - [](int index) { ... }
	// - [](int thread_id, int index) { ... }
	// - Functions accepting a range of loop index:
	// - [](std::tuple<int, int> index) { ... }
	// - [](int thread_id, std::tuple<int, int> index) { ... }
	//
	// When distributing workload between threads, it is assumed that each loop
	// iteration takes approximately equal time to complete.
	template <typename F>
	void ParallelFor(ContextImpl* context,
	int start,
	int end,
	int num_threads,
	F&& function,
	int min_block_size = 1) {
	CHECK_GT(num_threads, 0);
	if (start >= end) {
	return;
	}

	if (num_threads == 1 \|\| end - start < min_block_size * 2) {
	InvokeOnSegment(0, std::make_tuple(start, end), std::forward<F>(function));
	return;
	}

	CHECK(context != nullptr);
	ParallelInvoke(context,
	start,
	end,
	num_threads,
	std::forward<F>(function),
	min_block_size);
	}

	// Execute function for every element in the range [start, end) with at most
	// num_threads, using user-provided partitions array.
	// When distributing workload between threads, it is assumed that each segment
	// bounded by adjacent elements of partitions array takes approximately equal
	// time to process.
	template <typename F>
	void ParallelFor(ContextImpl* context,
	int start,
	int end,
	int num_threads,
	F&& function,
	const std::vector<int>& partitions) {
	CHECK_GT(num_threads, 0);
	if (start >= end) {
	return;
	}
	CHECK_EQ(partitions.front(), start);
	CHECK_EQ(partitions.back(), end);
	if (num_threads == 1 \|\| end - start <= num_threads) {
	ParallelFor(context, start, end, num_threads, std::forward<F>(function));
	return;
	}
	CHECK_GT(partitions.size(), 1);
	const int num_partitions = partitions.size() - 1;
	ParallelFor(context,
	0,
	num_partitions,
	num_threads,
	[&function, &partitions](int thread_id,
	std::tuple<int, int> partition_ids) {
	// partition_ids is a range of partition indices
	const auto [partition_start, partition_end] = partition_ids;
	// Execution over several adjacent segments is equivalent
	// to execution over union of those segments (which is also a
	// contiguous segment)
	const int range_start = partitions[partition_start];
	const int range_end = partitions[partition_end];
	// Range of original loop indices
	const auto range = std::make_tuple(range_start, range_end);
	InvokeOnSegment(thread_id, range, function);
	});
	}

	// Execute function for every element in the range [start, end) with at most
	// num_threads, taking into account user-provided integer cumulative costs of
	// iterations. Cumulative costs of iteration for indices in range [0, end) are
	// stored in objects from cumulative_cost_data. User-provided
	// cumulative_cost_fun returns non-decreasing integer values corresponding to
	// inclusive cumulative cost of loop iterations, provided with a reference to
	// user-defined object. Only indices from [start, end) will be referenced. This
	// routine assumes that cumulative_cost_fun is non-decreasing (in other words,
	// all costs are non-negative);
	// When distributing workload between threads, input range of loop indices will
	// be partitioned into disjoint contiguous intervals, with the maximal cost
	// being minimized.
	// For example, with iteration costs of [1, 1, 5, 3, 1, 4] cumulative_cost_fun
	// should return [1, 2, 7, 10, 11, 15], and with num_threads = 4 this range
	// will be split into segments [0, 2) [2, 3) [3, 5) [5, 6) with costs
	// [2, 5, 4, 4].
	template <typename F, typename CumulativeCostData, typename CumulativeCostFun>
	void ParallelFor(ContextImpl* context,
	int start,
	int end,
	int num_threads,
	F&& function,
	const CumulativeCostData* cumulative_cost_data,
	CumulativeCostFun&& cumulative_cost_fun) {
	CHECK_GT(num_threads, 0);
	if (start >= end) {
	return;
	}
	if (num_threads == 1 \|\| end - start <= num_threads) {
	ParallelFor(context, start, end, num_threads, std::forward<F>(function));
	return;
	}
	// Creating several partitions allows us to tolerate imperfections of
	// partitioning and user-supplied iteration costs up to a certain extent
	constexpr int kNumPartitionsPerThread = 4;
	const int kMaxPartitions = num_threads * kNumPartitionsPerThread;
	const auto& partitions = PartitionRangeForParallelFor(
	start,
	end,
	kMaxPartitions,
	cumulative_cost_data,
	std::forward<CumulativeCostFun>(cumulative_cost_fun));
	CHECK_GT(partitions.size(), 1);
	ParallelFor(
	context, start, end, num_threads, std::forward<F>(function), partitions);
	}
	} // namespace ceres::internal

	#endif // CERES_INTERNAL_PARALLEL_FOR_H_