Convert a nested loop into a linear loop in covariance_impl.cc. This improves the readability and simplifies the logic for interfacing with ParallelFor. More importantly, it paves the way for ParallelFor refactoring to improve its performance. Change-Id: I13b05596228900ee00d71f2ccce1db338844b9ab
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc index f7c7126..8b34bc1 100644 --- a/internal/ceres/covariance_impl.cc +++ b/internal/ceres/covariance_impl.cc
@@ -344,49 +344,34 @@ ThreadTokenProvider thread_token_provider(num_threads); -#ifdef CERES_USE_OPENMP -// The collapse() directive is only supported in OpenMP 3.0 and higher. OpenMP -// 3.0 was released in May 2008 (hence the version number). -# if _OPENMP >= 200805 -# pragma omp parallel for num_threads(num_threads) schedule(dynamic) collapse(2) -# else + // Technically the following code is a double nested loop where + // i = 1:n, j = i:n. + // + // To make things easier to parallelize, we instead iterate over k = + // 1:n^2, compute i and j from k, and skip over the lower triangular part of + // the loop. +#if defined(CERES_USE_OPENMP) # pragma omp parallel for num_threads(num_threads) schedule(dynamic) -# endif - for (int i = 0; i < num_parameters; ++i) { - for (int j = 0; j < num_parameters; ++j) { - // The second loop can't start from j = i for compatibility with OpenMP - // collapse command. The conditional serves as a workaround - if (j < i) { - continue; - } #endif // CERES_USE_OPENMP - -#ifdef CERES_NO_THREADS - for (int i = 0; i < num_parameters; ++i) { - for (int j = i; j < num_parameters; ++j) { -#endif // CERES_NO_THREADS - -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - - // The parallel for abstraction does not have support for constraining the - // number of workers in nested parallel for loops. Consequently, we will try - // to evenly distribute the number of workers between the each parallel for - // loop. - // TODO(vitus): consolidate the nested for loops into a single loop which can - // be properly split between the threads. 
+#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) + for (int k = 0; k < num_parameters * num_parameters; ++k) { +#else problem_->context()->EnsureMinimumThreads(num_threads); - const int num_outer_threads = std::sqrt(num_threads); - const int num_inner_threads = num_threads / num_outer_threads; ParallelFor(problem_->context(), 0, - num_parameters, - num_outer_threads, - [&](int i) { - ParallelFor(problem_->context(), i, - num_parameters, - num_inner_threads, - [&](int j) { -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) + num_parameters * num_parameters, + num_threads, + [&](int k) { +#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) + int i = k / num_parameters; + int j = k % num_parameters; + if (j < i) { +#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) + return; +#else + continue; +#endif + } int covariance_row_idx = cum_parameter_size[i]; int covariance_col_idx = cum_parameter_size[j]; @@ -415,10 +400,7 @@ } } #if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - ); - }); -#else - } + ); #endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) return success; }