Convert a nested loop into a linear loop in covariance_impl.h

This improves the readability and simplifies the logic
for interfacing with ParallelFor. More importantly, it paves the
way for ParallelFor refactoring to improve its performance.

Change-Id: I13b05596228900ee00d71f2ccce1db338844b9ab
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index f7c7126..8b34bc1 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -344,49 +344,34 @@
 
   ThreadTokenProvider thread_token_provider(num_threads);
 
-#ifdef CERES_USE_OPENMP
-// The collapse() directive is only supported in OpenMP 3.0 and higher. OpenMP
-// 3.0 was released in May 2008 (hence the version number).
-#  if _OPENMP >= 200805
-#    pragma omp parallel for num_threads(num_threads) schedule(dynamic) collapse(2)
-#  else
+  // Technically the following code is a double nested loop where
+  // i = 1:n, j = i:n.
+  //
+  // To make things easier to parallelize, we instead iterate over k =
+  // 1:n^2, compute i * j ,and skip over the lower triangular part of
+  // the loop.
+#if defined(CERES_USE_OPENMP)
 #    pragma omp parallel for num_threads(num_threads) schedule(dynamic)
-#  endif
-  for (int i = 0; i < num_parameters; ++i) {
-    for (int j = 0; j < num_parameters; ++j) {
-      // The second loop can't start from j = i for compatibility with OpenMP
-      // collapse command. The conditional serves as a workaround
-      if (j < i) {
-        continue;
-      }
 #endif // CERES_USE_OPENMP
-
-#ifdef CERES_NO_THREADS
-  for (int i = 0; i < num_parameters; ++i) {
-    for (int j = i; j < num_parameters; ++j) {
-#endif // CERES_NO_THREADS
-
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-
-  // The parallel for abstraction does not have support for constraining the
-  // number of workers in nested parallel for loops. Consequently, we will try
-  // to evenly distribute the number of workers between the each parallel for
-  // loop.
-  // TODO(vitus): consolidate the nested for loops into a single loop which can
-  // be properly split between the threads.
+#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+  for (int k = 0; k < num_parameters * num_parameters; ++k) {
+#else
   problem_->context()->EnsureMinimumThreads(num_threads);
-  const int num_outer_threads = std::sqrt(num_threads);
-  const int num_inner_threads = num_threads / num_outer_threads;
   ParallelFor(problem_->context(),
               0,
-              num_parameters,
-              num_outer_threads,
-              [&](int i) {
-    ParallelFor(problem_->context(), i,
-                num_parameters,
-                num_inner_threads,
-                [&](int j) {
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+              num_parameters * num_parameters,
+              num_threads,
+              [&](int k) {
+#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+      int i = k / num_parameters;
+      int j = k % num_parameters;
+      if (j < i) {
+#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+        return;
+#else
+        continue;
+#endif
+      }
 
       int covariance_row_idx = cum_parameter_size[i];
       int covariance_col_idx = cum_parameter_size[j];
@@ -415,10 +400,7 @@
       }
     }
 #if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-    );
-  });
-#else
-  }
+   );
 #endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
   return success;
 }