Parallel for with iteration costs and left product
Parallel for with user-supplied [cumulative] iteration costs allows to
get performance improvements on problems with significantly different
time requirements per parallel loop iteration.
One of those problems is left multiplication with block-sparse matrix.
Using number of non-zero values per column block, we partition column
blocks into contiguous sets with approximately equal number of
operations to be performed.
Change-Id: I4a862a10a586cdfbec22e8168a3423537039abc2
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index 053b3c0..f0dc11a 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -149,13 +149,64 @@
}
void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const {
+ // While utilizing transposed structure allows to perform parallel
+ // left-multiplication by dense vector, it makes access patterns to matrix
+ // elements scattered. Thus, multiplication using transposed structure
+ // is only useful for parallel execution
+ CHECK(x != nullptr);
+ CHECK(y != nullptr);
+ if (transpose_block_structure_ == nullptr || num_threads == 1) {
+ LeftMultiplyAndAccumulate(x, y);
+ return;
+ }
+
+ auto transpose_bs = transpose_block_structure_.get();
+ const auto values = values_.get();
+ const int num_col_blocks = transpose_bs->rows.size();
+ if (!num_col_blocks) {
+ return;
+ }
+
+ // Use non-zero count as iteration cost for guided parallel-for loop
+ ParallelFor(
+ context,
+ 0,
+ num_col_blocks,
+ num_threads,
+ [values, transpose_bs, x, y](int row_block_id) {
+ int row_block_pos = transpose_bs->rows[row_block_id].block.position;
+ int row_block_size = transpose_bs->rows[row_block_id].block.size;
+ auto& cells = transpose_bs->rows[row_block_id].cells;
+
+ for (auto& cell : cells) {
+ const int col_block_id = cell.block_id;
+ const int col_block_size = transpose_bs->cols[col_block_id].size;
+ const int col_block_pos = transpose_bs->cols[col_block_id].position;
+ MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+ values + cell.position,
+ col_block_size,
+ row_block_size,
+ x + col_block_pos,
+ y + row_block_pos);
+ }
+ },
+ transpose_bs->rows.data(),
+ [](const CompressedRow& row) { return row.cumulative_nnz; });
+}
+
+void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
double* y) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
+ // Single-threaded left products are always computed using a non-transpose
+ // block structure, because it has linear acess pattern to matrix elements
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_pos = block_structure_->rows[i].block.position;
int row_block_size = block_structure_->rows[i].block.size;
- auto& cells = block_structure_->rows[i].cells;
+ const auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h
index b4abdc7..72b1d68 100644
--- a/internal/ceres/block_sparse_matrix.h
+++ b/internal/ceres/block_sparse_matrix.h
@@ -78,6 +78,10 @@
ContextImpl* context,
int num_threads) const final;
void LeftMultiplyAndAccumulate(const double* x, double* y) const final;
+ void LeftMultiplyAndAccumulate(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const final;
void SquaredColumnNorm(double* x) const final;
void ScaleColumns(const double* scale) final;
void ToCompressedRowSparseMatrix(CompressedRowSparseMatrix* matrix) const;
diff --git a/internal/ceres/block_sparse_matrix_test.cc b/internal/ceres/block_sparse_matrix_test.cc
index cfed041..6533a80 100644
--- a/internal/ceres/block_sparse_matrix_test.cc
+++ b/internal/ceres/block_sparse_matrix_test.cc
@@ -186,6 +186,22 @@
}
}
+TEST_F(BlockSparseMatrixTest, LeftMultiplyAndAccumulateParallelTest) {
+ Vector y_0 = Vector::Random(A_->num_rows());
+ Vector y_s = y_0;
+ Vector y_p = y_0;
+
+ Vector x = Vector::Random(A_->num_cols());
+ A_->LeftMultiplyAndAccumulate(x.data(), y_s.data());
+
+ A_->AddTransposeBlockStructure();
+ A_->LeftMultiplyAndAccumulate(x.data(), y_p.data(), &context_, kNumThreads);
+
+ // Parallel implementation for left products uses a different order of
+ // traversal, thus results might be different
+ EXPECT_LT((y_s - y_p).norm(), 1e-12);
+}
+
TEST_F(BlockSparseMatrixTest, SquaredColumnNormTest) {
Vector y_a = Vector::Zero(A_->num_cols());
Vector y_b = Vector::Zero(A_->num_cols());
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index e1c571d..524356a 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -99,7 +99,7 @@
// y = y + Atz
z_.setZero();
A_.RightMultiplyAndAccumulate(x, z_, context_, num_threads_);
- A_.LeftMultiplyAndAccumulate(z_, y);
+ A_.LeftMultiplyAndAccumulate(z_, y, context_, num_threads_);
// y = y + DtDx
if (D_ != nullptr) {
@@ -165,7 +165,9 @@
preconditioner_ = std::make_unique<IdentityPreconditioner>(A->num_cols());
}
}
-
+ if (options_.num_threads > 1) {
+ A->AddTransposeBlockStructure();
+ }
preconditioner_->Update(*A, per_solve_options.D);
ConjugateGradientsSolverOptions cg_options;
@@ -181,7 +183,8 @@
// rhs = Atb.
Vector rhs(A->num_cols());
rhs.setZero();
- A->LeftMultiplyAndAccumulate(b, rhs.data());
+ A->LeftMultiplyAndAccumulate(
+ b, rhs.data(), options_.context, options_.num_threads);
cg_solution_ = Vector::Zero(A->num_cols());
for (int i = 0; i < 4; ++i) {
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index b18f563..56b4f19 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -116,6 +116,16 @@
return block_sparse_jacobian.get();
}
+ const BlockSparseMatrix* BlockSparseJacobianWithTranspose(
+ ContextImpl* context) {
+ if (!block_sparse_jacobian_with_transpose) {
+ auto regular = BlockSparseJacobian(context);
+ block_sparse_jacobian_with_transpose = CreateBlockSparseJacobian(context);
+ block_sparse_jacobian_with_transpose->AddTransposeBlockStructure();
+ }
+ return block_sparse_jacobian_with_transpose.get();
+ }
+
const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
ContextImpl* context) {
if (!crs_jacobian) {
@@ -127,6 +137,7 @@
Vector parameters;
std::unique_ptr<BundleAdjustmentProblem> bal_problem;
std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
+ std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_with_transpose;
std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
};
@@ -217,13 +228,32 @@
static void JacobianLeftMultiplyAndAccumulate(benchmark::State& state,
BALData* data,
ContextImpl* context) {
+ const int num_threads = state.range(0);
+
auto jacobian = data->BlockSparseJacobian(context);
Vector y = Vector::Zero(jacobian->num_cols());
Vector x = Vector::Random(jacobian->num_rows());
for (auto _ : state) {
- jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
+ jacobian->LeftMultiplyAndAccumulate(
+ x.data(), y.data(), context, num_threads);
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void JacobianLeftMultiplyAndAccumulateWithTranspose(
+ benchmark::State& state, BALData* data, ContextImpl* context) {
+ const int num_threads = state.range(0);
+
+ auto jacobian = data->BlockSparseJacobianWithTranspose(context);
+
+ Vector y = Vector::Zero(jacobian->num_cols());
+ Vector x = Vector::Random(jacobian->num_rows());
+
+ for (auto _ : state) {
+ jacobian->LeftMultiplyAndAccumulate(
+ x.data(), y.data(), context, num_threads);
}
CHECK_GT(y.squaredNorm(), 0.);
}
@@ -362,11 +392,25 @@
&context)
->Arg(1);
+ const std::string name_left_product_transpose =
+ "JacobianLeftMultiplyAndAccumulateWithTranspose<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_left_product_transpose.c_str(),
+ ceres::internal::JacobianLeftMultiplyAndAccumulateWithTranspose,
+ data,
+ &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16)
+ ->Arg(28);
+
#ifndef CERES_NO_CUDA
const std::string name_left_product_cuda =
"JacobianLeftMultiplyAndAccumulateCuda<" + path + ">";
::benchmark::RegisterBenchmark(
- name_right_product_cuda.c_str(),
+ name_left_product_cuda.c_str(),
ceres::internal::JacobianLeftMultiplyAndAccumulateCuda,
data,
&context)
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index 234c7db..107d19c 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -32,6 +32,7 @@
#ifndef CERES_INTERNAL_PARALLEL_FOR_H_
#define CERES_INTERNAL_PARALLEL_FOR_H_
+#include <algorithm>
#include <functional>
#include <mutex>
@@ -93,6 +94,124 @@
void Invoke(int thread_id, int i, const F& function) {
InvokeImpl<F, args_of_t<F>>::Invoke(thread_id, i, function);
}
+
+// Check if it is possible to split range [start; end) into at most
+// max_num_partitions contiguous partitions of cost not greater than
+// max_partition_cost. Inclusive integer cumulative costs are provided by
+// cumulative_cost_data objects, with cumulative_cost_offset being a total cost
+// of all indices (starting from zero) preceding start element. Cumulative costs
+// are returned by cumulative_cost_fun called with a reference to
+// cumulative_cost_data element with index from range[start; end), and should be
+// non-decreasing. Partition of the range is returned via partition argument
+template <typename CumulativeCostData, typename CumulativeCostFun>
+bool MaxPartitionCostIsFeasible(int start,
+ int end,
+ int max_num_partitions,
+ int max_partition_cost,
+ int cumulative_cost_offset,
+ const CumulativeCostData* cumulative_cost_data,
+ const CumulativeCostFun& cumulative_cost_fun,
+ std::vector<int>* partition) {
+ partition->clear();
+ partition->push_back(start);
+ int partition_start = start;
+ int cost_offset = cumulative_cost_offset;
+
+ const CumulativeCostData* const range_end = cumulative_cost_data + end;
+ while (partition_start < end) {
+ // Already have max_num_partitions
+ if (partition->size() > max_num_partitions) {
+ return false;
+ }
+ const int target = max_partition_cost + cost_offset;
+ const int partition_end =
+ std::partition_point(
+ cumulative_cost_data + partition_start,
+ cumulative_cost_data + end,
+ [&cumulative_cost_fun, target](const CumulativeCostData& item) {
+ return cumulative_cost_fun(item) <= target;
+ }) -
+ cumulative_cost_data;
+ // Unable to make a partition from a single element
+ if (partition_end == partition_start) {
+ return false;
+ }
+
+ const int cost_last =
+ cumulative_cost_fun(cumulative_cost_data[partition_end - 1]);
+ partition->push_back(partition_end);
+ partition_start = partition_end;
+ cost_offset = cost_last;
+ }
+ return true;
+}
+
+// Split integer interval [start, end) into at most max_num_partitions
+// contiguous intervals, minimizing maximal total cost of a single interval.
+// Inclusive integer cumulative costs for each (zero-based) index are provided
+// by cumulative_cost_data objects, and are returned by cumulative_cost_fun call
+// with a reference to one of the objects from range [start, end)
+template <typename CumulativeCostData, typename CumulativeCostFun>
+std::vector<int> ComputePartition(
+ int start,
+ int end,
+ int max_num_partitions,
+ const CumulativeCostData* cumulative_cost_data,
+ const CumulativeCostFun& cumulative_cost_fun) {
+ // Given maximal partition cost, it is possible to verify if it is admissible
+ // and obtain corresponding partition using MaxPartitionCostIsFeasible
+ // function. In order to find the lowest admissible value, a binary search
+ // over all potentially optimal cost values is being performed
+ const int cumulative_cost_last =
+ cumulative_cost_fun(cumulative_cost_data[end - 1]);
+ const int cumulative_cost_offset =
+ start ? cumulative_cost_fun(cumulative_cost_data[start - 1]) : 0;
+ const int total_cost = cumulative_cost_last - cumulative_cost_offset;
+
+ // Minimal maximal partition cost is not smaller than the average
+ // We will use non-inclusive lower bound
+ int partition_cost_lower_bound = total_cost / max_num_partitions - 1;
+ // Minimal maximal partition cost is not larger than the total cost
+ // Upper bound is inclusive
+ int partition_cost_upper_bound = total_cost;
+
+ std::vector<int> partition, partition_upper_bound;
+ // Binary search over partition cost, returning the lowest admissible cost
+ while (partition_cost_upper_bound - partition_cost_lower_bound > 1) {
+ partition.reserve(max_num_partitions + 1);
+ const int partition_cost =
+ partition_cost_lower_bound +
+ (partition_cost_upper_bound - partition_cost_lower_bound) / 2;
+ bool admissible = MaxPartitionCostIsFeasible(start,
+ end,
+ max_num_partitions,
+ partition_cost,
+ cumulative_cost_offset,
+ cumulative_cost_data,
+ cumulative_cost_fun,
+ &partition);
+ if (admissible) {
+ partition_cost_upper_bound = partition_cost;
+ std::swap(partition, partition_upper_bound);
+ } else {
+ partition_cost_lower_bound = partition_cost;
+ }
+ }
+
+ // After binary search over partition cost, interval
+ // (partition_cost_lower_bound, partition_cost_upper_bound] contains the only
+ // admissible partition cost value - partition_cost_upper_bound
+ //
+ // Partition for this cost value might have been already computed
+ if (partition_upper_bound.empty() == false) {
+ return partition_upper_bound;
+ }
+ // Partition for upper bound is not computed if and only if upper bound was
+ // never updated This is a simple case of a single interval containing all
+ // values, which we were not able to break into pieces
+ partition = {start, end};
+ return partition;
+}
} // namespace parallel_for_details
// Forward declaration of parallel invocation function that is to be
@@ -131,6 +250,55 @@
CHECK(context != nullptr);
ParallelInvoke<F>(context, start, end, num_threads, function);
}
+
+// Execute function for every element in the range [start, end) with at most
+// num_threads, taking into account user-provided integer cumulative costs of
+// iterations. Cumulative costs of iteration for indices in range [0, end) are
+// stored in objects from cumulative_cost_data. User-provided
+// cumulative_cost_fun returns non-decreasing integer values corresponding to
+// inclusive cumulative cost of loop iterations, provided with a reference to
+// user-defined object. Only indices from [start, end) will be referenced. This
+// routine assumes that cumulative_cost_fun is non-decreasing (in other words,
+// all costs are non-negative);
+template <typename F, typename CumulativeCostData, typename CumulativeCostFun>
+void ParallelFor(ContextImpl* context,
+ int start,
+ int end,
+ int num_threads,
+ const F& function,
+ const CumulativeCostData* cumulative_cost_data,
+ const CumulativeCostFun& cumulative_cost_fun) {
+ using namespace parallel_for_details;
+ CHECK_GT(num_threads, 0);
+ if (start >= end) {
+ return;
+ }
+ if (num_threads == 1 || end - start <= num_threads) {
+ ParallelFor(context, start, end, num_threads, function);
+ return;
+ }
+ // Creating several partitions allows us to tolerate imperfections of
+ // partitioning and user-supplied iteration costs up to a certain extent
+ const int kNumPartitionsPerThread = 4;
+ const int kMaxPartitions = num_threads * kNumPartitionsPerThread;
+ const std::vector<int> partitions = ComputePartition(
+ start, end, kMaxPartitions, cumulative_cost_data, cumulative_cost_fun);
+ CHECK_GT(partitions.size(), 1);
+ const int num_partitions = partitions.size() - 1;
+ ParallelFor(context,
+ 0,
+ num_partitions,
+ num_threads,
+ [&function, &partitions](int thread_id, int partition_id) {
+ const int partition_start = partitions[partition_id];
+ const int partition_end = partitions[partition_id + 1];
+
+ for (int i = partition_start; i < partition_end; ++i) {
+ Invoke<F>(thread_id, i, function);
+ }
+ });
+}
+
} // namespace ceres::internal
// Backend-specific implementations of ParallelInvoke
diff --git a/internal/ceres/parallel_for_test.cc b/internal/ceres/parallel_for_test.cc
index 451d80e..b74165e 100644
--- a/internal/ceres/parallel_for_test.cc
+++ b/internal/ceres/parallel_for_test.cc
@@ -38,6 +38,8 @@
#include <cmath>
#include <condition_variable>
#include <mutex>
+#include <numeric>
+#include <random>
#include <thread>
#include <vector>
@@ -166,4 +168,251 @@
}
#endif // CERES_NO_THREADS
+// Helper function for partition tests
+bool BruteForcePartition(
+ int* costs, int start, int end, int max_cost, int max_partitions);
+// Basic test if MaxPartitionCostIsFeasible and BruteForcePartition agree on
+// simple test-cases
+TEST(GuidedParallelFor, MaxPartitionCostIsFeasible) {
+ using namespace parallel_for_details;
+ std::vector<int> costs, cumulative_costs, partition;
+ costs = {1, 2, 3, 5, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0};
+ cumulative_costs.resize(costs.size());
+ std::partial_sum(costs.begin(), costs.end(), cumulative_costs.begin());
+ const auto dummy_getter = [](const int v) { return v; };
+
+ // [1, 2, 3] [5], [0 ... 0, 7, 0, ... 0]
+ EXPECT_TRUE(MaxPartitionCostIsFeasible(0,
+ costs.size(),
+ 3,
+ 7,
+ 0,
+ cumulative_costs.data(),
+ dummy_getter,
+ &partition));
+ EXPECT_TRUE(BruteForcePartition(costs.data(), 0, costs.size(), 3, 7));
+ // [1, 2, 3, 5, 0 ... 0, 7, 0, ... 0]
+ EXPECT_TRUE(MaxPartitionCostIsFeasible(0,
+ costs.size(),
+ 3,
+ 18,
+ 0,
+ cumulative_costs.data(),
+ dummy_getter,
+ &partition));
+ EXPECT_TRUE(BruteForcePartition(costs.data(), 0, costs.size(), 3, 18));
+ // Impossible since there is item of cost 7
+ EXPECT_FALSE(MaxPartitionCostIsFeasible(0,
+ costs.size(),
+ 3,
+ 6,
+ 0,
+ cumulative_costs.data(),
+ dummy_getter,
+ &partition));
+ EXPECT_FALSE(BruteForcePartition(costs.data(), 0, costs.size(), 3, 6));
+ // Impossible
+ EXPECT_FALSE(MaxPartitionCostIsFeasible(0,
+ costs.size(),
+ 2,
+ 10,
+ 0,
+ cumulative_costs.data(),
+ dummy_getter,
+ &partition));
+ EXPECT_FALSE(BruteForcePartition(costs.data(), 0, costs.size(), 2, 10));
+}
+
+// Randomized tests for MaxPartitionCostIsFeasible
+TEST(GuidedParallelFor, MaxPartitionCostIsFeasibleRandomized) {
+ using namespace parallel_for_details;
+ std::vector<int> costs, cumulative_costs, partition;
+ const auto dummy_getter = [](const int v) { return v; };
+
+ // Random tests
+ const int kNumTests = 1000;
+ const int kMaxElements = 32;
+ const int kMaxPartitions = 16;
+ const int kMaxElCost = 8;
+ std::mt19937 rng;
+ std::uniform_int_distribution<int> rng_N(1, kMaxElements);
+ std::uniform_int_distribution<int> rng_M(1, kMaxPartitions);
+ std::uniform_int_distribution<int> rng_e(0, kMaxElCost);
+ for (int t = 0; t < kNumTests; ++t) {
+ const int N = rng_N(rng);
+ const int M = rng_M(rng);
+ int total = 0;
+ costs.clear();
+ for (int i = 0; i < N; ++i) {
+ costs.push_back(rng_e(rng));
+ total += costs.back();
+ }
+
+ cumulative_costs.resize(N);
+ std::partial_sum(costs.begin(), costs.end(), cumulative_costs.begin());
+
+ std::uniform_int_distribution<int> rng_seg(0, N - 1);
+ int start = rng_seg(rng);
+ int end = rng_seg(rng);
+ if (start > end) std::swap(start, end);
+ ++end;
+
+ int first_admissible = 0;
+ for (int threshold = 1; threshold <= total; ++threshold) {
+ const bool bruteforce =
+ BruteForcePartition(costs.data(), start, end, M, threshold);
+ if (bruteforce && !first_admissible) {
+ first_admissible = threshold;
+ }
+ const bool binary_search =
+ MaxPartitionCostIsFeasible(start,
+ end,
+ M,
+ threshold,
+ start ? cumulative_costs[start - 1] : 0,
+ cumulative_costs.data(),
+ dummy_getter,
+ &partition);
+ EXPECT_EQ(bruteforce, binary_search);
+ EXPECT_LE(partition.size(), M + 1);
+ // check partition itself
+ if (binary_search) {
+ ASSERT_GT(partition.size(), 1);
+ EXPECT_EQ(partition.front(), start);
+ EXPECT_EQ(partition.back(), end);
+
+ const int num_partitions = partition.size() - 1;
+ EXPECT_LE(num_partitions, M);
+ for (int j = 0; j < num_partitions; ++j) {
+ int total = 0;
+ for (int k = partition[j]; k < partition[j + 1]; ++k) {
+ EXPECT_LT(k, end);
+ EXPECT_GE(k, start);
+ total += costs[k];
+ }
+ EXPECT_LE(total, threshold);
+ }
+ }
+ }
+ }
+}
+
+// Randomized tests for CreatePartition
+TEST(GuidedParallelFor, CreatePartition) {
+ using namespace parallel_for_details;
+ std::vector<int> costs, cumulative_costs, partition;
+ const auto dummy_getter = [](const int v) { return v; };
+
+ // Random tests
+ const int kNumTests = 1000;
+ const int kMaxElements = 32;
+ const int kMaxPartitions = 16;
+ const int kMaxElCost = 8;
+ std::mt19937 rng;
+ std::uniform_int_distribution<int> rng_N(1, kMaxElements);
+ std::uniform_int_distribution<int> rng_M(1, kMaxPartitions);
+ std::uniform_int_distribution<int> rng_e(0, kMaxElCost);
+ for (int t = 0; t < kNumTests; ++t) {
+ const int N = rng_N(rng);
+ const int M = rng_M(rng);
+ int total = 0;
+ costs.clear();
+ for (int i = 0; i < N; ++i) {
+ costs.push_back(rng_e(rng));
+ total += costs.back();
+ }
+
+ cumulative_costs.resize(N);
+ std::partial_sum(costs.begin(), costs.end(), cumulative_costs.begin());
+
+ std::uniform_int_distribution<int> rng_seg(0, N - 1);
+ int start = rng_seg(rng);
+ int end = rng_seg(rng);
+ if (start > end) std::swap(start, end);
+ ++end;
+
+ int first_admissible = 0;
+ for (int threshold = 1; threshold <= total; ++threshold) {
+ const bool bruteforce =
+ BruteForcePartition(costs.data(), start, end, M, threshold);
+ if (bruteforce) {
+ first_admissible = threshold;
+ break;
+ }
+ }
+ EXPECT_TRUE(first_admissible != 0 || total == 0);
+ partition =
+ ComputePartition(start, end, M, cumulative_costs.data(), dummy_getter);
+ ASSERT_GT(partition.size(), 1);
+ EXPECT_EQ(partition.front(), start);
+ EXPECT_EQ(partition.back(), end);
+
+ const int num_partitions = partition.size() - 1;
+ EXPECT_LE(num_partitions, M);
+ for (int j = 0; j < num_partitions; ++j) {
+ int total = 0;
+ for (int k = partition[j]; k < partition[j + 1]; ++k) {
+ EXPECT_LT(k, end);
+ EXPECT_GE(k, start);
+ total += costs[k];
+ }
+ EXPECT_LE(total, first_admissible);
+ }
+ }
+}
+
+// Recursively try to partition range into segements of total cost
+// less than max_cost
+bool BruteForcePartition(
+ int* costs, int start, int end, int max_partitions, int max_cost) {
+ if (start == end) return true;
+ if (start < end && max_partitions == 0) return false;
+ int total_cost = 0;
+ for (int last_curr = start + 1; last_curr <= end; ++last_curr) {
+ total_cost += costs[last_curr - 1];
+ if (total_cost > max_cost) break;
+ if (BruteForcePartition(
+ costs, last_curr, end, max_partitions - 1, max_cost))
+ return true;
+ }
+ return false;
+}
+
+// Tests if guided parallel for loop computes the correct result for various
+// number of threads.
+TEST(GuidedParallelFor, NumThreads) {
+ ContextImpl context;
+ context.EnsureMinimumThreads(/*num_threads=*/2);
+
+ const int size = 16;
+ std::vector<int> expected_results(size, 0);
+ for (int i = 0; i < size; ++i) {
+ expected_results[i] = std::sqrt(i);
+ }
+
+ std::vector<int> costs, cumulative_costs;
+ for (int i = 1; i <= size; ++i) {
+ int cost = i * i;
+ costs.push_back(cost);
+ if (i == 1) {
+ cumulative_costs.push_back(cost);
+ } else {
+ cumulative_costs.push_back(cost + cumulative_costs.back());
+ }
+ }
+
+ for (int num_threads = 1; num_threads <= 8; ++num_threads) {
+ std::vector<int> values(size, 0);
+ ParallelFor(
+ &context,
+ 0,
+ size,
+ num_threads,
+ [&values](int i) { values[i] = std::sqrt(i); },
+ cumulative_costs.data(),
+ [](const int v) { return v; });
+ EXPECT_THAT(values, ElementsAreArray(expected_results));
+ }
+}
+
} // namespace ceres::internal