Convert a nested loop into a linear loop in covariance_impl.cc
This improves the readability and simplifies the logic
for interfacing with ParallelFor. More importantly, it paves the
way for ParallelFor refactoring to improve its performance.
Change-Id: I13b05596228900ee00d71f2ccce1db338844b9ab
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index f7c7126..8b34bc1 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -344,49 +344,34 @@
ThreadTokenProvider thread_token_provider(num_threads);
-#ifdef CERES_USE_OPENMP
-// The collapse() directive is only supported in OpenMP 3.0 and higher. OpenMP
-// 3.0 was released in May 2008 (hence the version number).
-# if _OPENMP >= 200805
-# pragma omp parallel for num_threads(num_threads) schedule(dynamic) collapse(2)
-# else
+ // Technically the following code is a double nested loop where
+ // i = 1:n, j = i:n.
+ //
+ // To make things easier to parallelize, we instead iterate over k =
+  // 1:n^2, compute i and j, and skip over the lower triangular part of
+ // the loop.
+#if defined(CERES_USE_OPENMP)
# pragma omp parallel for num_threads(num_threads) schedule(dynamic)
-# endif
- for (int i = 0; i < num_parameters; ++i) {
- for (int j = 0; j < num_parameters; ++j) {
- // The second loop can't start from j = i for compatibility with OpenMP
- // collapse command. The conditional serves as a workaround
- if (j < i) {
- continue;
- }
#endif // CERES_USE_OPENMP
-
-#ifdef CERES_NO_THREADS
- for (int i = 0; i < num_parameters; ++i) {
- for (int j = i; j < num_parameters; ++j) {
-#endif // CERES_NO_THREADS
-
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-
- // The parallel for abstraction does not have support for constraining the
- // number of workers in nested parallel for loops. Consequently, we will try
- // to evenly distribute the number of workers between the each parallel for
- // loop.
- // TODO(vitus): consolidate the nested for loops into a single loop which can
- // be properly split between the threads.
+#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+ for (int k = 0; k < num_parameters * num_parameters; ++k) {
+#else
problem_->context()->EnsureMinimumThreads(num_threads);
- const int num_outer_threads = std::sqrt(num_threads);
- const int num_inner_threads = num_threads / num_outer_threads;
ParallelFor(problem_->context(),
0,
- num_parameters,
- num_outer_threads,
- [&](int i) {
- ParallelFor(problem_->context(), i,
- num_parameters,
- num_inner_threads,
- [&](int j) {
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+ num_parameters * num_parameters,
+ num_threads,
+ [&](int k) {
+#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+ int i = k / num_parameters;
+ int j = k % num_parameters;
+ if (j < i) {
+#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+ return;
+#else
+ continue;
+#endif
+ }
int covariance_row_idx = cum_parameter_size[i];
int covariance_col_idx = cum_parameter_size[j];
@@ -415,10 +400,7 @@
}
}
#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- );
- });
-#else
- }
+ );
#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
return success;
}