Improve threading in covariance.

Covariance computation wants to do a triangular iteration but as a
single loop. Right now it iterates over a square and does nothing half
the time, which is inefficient and has bad worst-case threading
performance. This adds a utility that allows waste-free linear iteration
over a triangle.

Change-Id: I881d5683c65882f87dc2b5f8449a855d22ace755
diff --git a/BUILD b/BUILD
index 25949c1..4018a68 100644
--- a/BUILD
+++ b/BUILD
@@ -123,6 +123,7 @@
     "numeric_diff_cost_function",
     "ordered_groups",
     "parallel_for",
+    "parallel_utils",
     "parameter_block_ordering",
     "parameter_block",
     "partitioned_matrix_view",
diff --git a/bazel/ceres.bzl b/bazel/ceres.bzl
index 090707d..d90e5a3 100644
--- a/bazel/ceres.bzl
+++ b/bazel/ceres.bzl
@@ -91,6 +91,7 @@
     "normal_prior.cc",
     "parallel_for_cxx.cc",
     "parallel_for_tbb.cc",
+    "parallel_utils.cc",
     "parameter_block_ordering.cc",
     "partitioned_matrix_view.cc",
     "polynomial.cc",
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index bddbe79..55cd0eb 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -92,6 +92,7 @@
     normal_prior.cc
     parallel_for_cxx.cc
     parallel_for_tbb.cc
+    parallel_utils.cc
     parameter_block_ordering.cc
     partitioned_matrix_view.cc
     polynomial.cc
@@ -348,6 +349,7 @@
   ceres_test(numeric_diff_cost_function)
   ceres_test(ordered_groups)
   ceres_test(parallel_for)
+  ceres_test(parallel_utils)
   ceres_test(parameter_block)
   ceres_test(parameter_block_ordering)
   ceres_test(partitioned_matrix_view)
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index 8e13524..c52866b 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -52,6 +52,7 @@
 #include "ceres/crs_matrix.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/map_util.h"
+#include "ceres/parallel_utils.h"
 #include "ceres/parameter_block.h"
 #include "ceres/problem_impl.h"
 #include "ceres/residual_block.h"
@@ -348,32 +349,22 @@
 
   // Technically the following code is a double nested loop where
   // i = 1:n, j = i:n.
-  //
-  // To make things easier to parallelize, we instead iterate over k =
-  // 1:n^2, compute i * j ,and skip over the lower triangular part of
-  // the loop.
+  int iteration_count = (num_parameters * (num_parameters + 1)) / 2;
 #if defined(CERES_USE_OPENMP)
 #    pragma omp parallel for num_threads(num_threads) schedule(dynamic)
 #endif // CERES_USE_OPENMP
 #if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-  for (int k = 0; k < num_parameters * num_parameters; ++k) {
+  for (int k = 0; k < iteration_count; ++k) {
 #else
   problem_->context()->EnsureMinimumThreads(num_threads);
   ParallelFor(problem_->context(),
               0,
-              num_parameters * num_parameters,
+              iteration_count,
               num_threads,
               [&](int thread_id, int k) {
 #endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-      int i = k / num_parameters;
-      int j = k % num_parameters;
-      if (j < i) {
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-        return;
-#else
-        continue;
-#endif
-      }
+      int i, j;
+      LinearIndexToUpperTriangularIndex(k, num_parameters, &i, &j);
 
       int covariance_row_idx = cum_parameter_size[i];
       int covariance_col_idx = cum_parameter_size[j];
diff --git a/internal/ceres/parallel_utils.cc b/internal/ceres/parallel_utils.cc
new file mode 100644
index 0000000..e1cb5f9
--- /dev/null
+++ b/internal/ceres/parallel_utils.cc
@@ -0,0 +1,90 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wjr@google.com (William Rucklidge)
+
+#include "ceres/parallel_utils.h"
+
+namespace ceres {
+namespace internal {
+
+void LinearIndexToUpperTriangularIndex(int k, int n, int* i, int* j) {
+  // This works by unfolding a rectangle into a triangle.
+  // Say n is even. 4 is a nice even number. The 10 i,j pairs that we
+  // want to produce are:
+  // 0,0 0,1 0,2 0,3
+  //     1,1 1,2 1,3
+  //         2,2 2,3
+  //             3,3
+  // This triangle can be folded into a 5x2 rectangle:
+  // 3,3 0,0 0,1 0,2 0,3
+  // 2,2 2,3 1,1 1,2 1,3
+
+  // If N is odd, say 5, then the 15 i,j pairs are:
+  // 0,0 0,1 0,2 0,3 0,4
+  //     1,1 1,2 1,3 1,4
+  //         2,2 2,3 2,3
+  //             3,3 3,4
+  //                 4,4
+  // which folds to a 5x3 rectangle:
+  // 0,0 0,1 0,2 0,3 0,4
+  // 4,4 1,1 1,2 1,3 1,4
+  // 3,3 3,4 2,2 2,3 2,4
+
+  // All this function does is map the linear iteration position to a
+  // location in the rectangle and work out the appropriate (i, j) for that
+  // location.
+  if (n & 1) {
+    // Odd n. The tip of the triangle is on row 1.
+    int w = n;  // Width of the rectangle to unfold
+    int i0 = k / w;
+    int j0 = k % w;
+    if (j0 >= i0) {
+      *i = i0;
+      *j = j0;
+    } else {
+      *i = n - i0;
+      *j = *i + j0;
+    }
+  } else {
+    // Even n. The tip of the triangle is on row 0, making it one wider.
+    int w = n + 1;
+    int i0 = k / w;
+    int j0 = k % w;
+    if (j0 > i0) {
+      *i = i0;
+      *j = j0 - 1;
+    } else {
+      *i = n - 1 - i0;
+      *j = *i + j0;
+    }
+  }
+}
+
+}  // namespace internal
+}  // namespace ceres
diff --git a/internal/ceres/parallel_utils.h b/internal/ceres/parallel_utils.h
new file mode 100644
index 0000000..1291428
--- /dev/null
+++ b/internal/ceres/parallel_utils.h
@@ -0,0 +1,67 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wjr@google.com (William Rucklidge)
+
+#ifndef CERES_INTERNAL_PARALLEL_UTILS_H_
+#define CERES_INTERNAL_PARALLEL_UTILS_H_
+
+namespace ceres {
+namespace internal {
+
+// Converts a linear iteration order into a triangular iteration order.
+// Suppose you have nested loops that look like
+// for (int i = 0; i < n; i++) {
+//   for (int j = i; j < n; j++) {
+//     ... use i and j
+//   }
+// }
+// Naively using ParallelFor to parallelise those loops might look like
+// ParallelFor(..., 0, n * n, num_threads,
+//   [](int thread_id, int k) {
+//     int i = k / n, j = k % n;
+//     if (j < i) return;
+//     ...
+//    });
+// but these empty work items can lead to very unbalanced threading. Instead,
+// do this:
+// int actual_work_items = (n * (n + 1)) / 2;
+// ParallelFor(..., 0, actual_work_items, num_threads,
+//   [](int thread_id, int k) {
+//     int i, j;
+//     UnfoldIteration(k, n, &i, &j);
+//     ...
+//    });
+// which in each iteration will produce i and j satisfying
+// 0 <= i <= j < n
+void LinearIndexToUpperTriangularIndex(int k, int n, int* i, int* j);
+
+}  // namespace internal
+}  // namespace ceres
+
+#endif  // CERES_INTERNAL_PARALLEL_UTILS_H_
diff --git a/internal/ceres/parallel_utils_test.cc b/internal/ceres/parallel_utils_test.cc
new file mode 100644
index 0000000..f997d25
--- /dev/null
+++ b/internal/ceres/parallel_utils_test.cc
@@ -0,0 +1,61 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wjr@google.com (William Rucklidge)
+
+// This include must come before any #ifndef check on Ceres compile options.
+#include "ceres/internal/port.h"
+#include "ceres/parallel_utils.h"
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace ceres {
+namespace internal {
+
+// Tests that unfolding linear iterations to triangular iterations produces
+// indices that are in-range and unique.
+TEST(LinearIndexToUpperTriangularIndexTest, UniqueAndValid) {
+  for (int n = 0; n < 100; n++) {
+    std::set<std::pair<int, int>> seen_pairs;
+    int actual_work_items = (n * (n + 1)) / 2;
+    for (int k = 0; k < actual_work_items; k++) {
+      int i, j;
+      LinearIndexToUpperTriangularIndex(k, n, &i, &j);
+      EXPECT_GE(i, 0);
+      EXPECT_LT(i, n);
+      EXPECT_GE(j, i);
+      EXPECT_LT(j, n);
+      seen_pairs.insert(std::make_pair(i, j));
+    }
+    EXPECT_EQ(actual_work_items, seen_pairs.size());
+  }
+}
+
+}  // namespace internal
+}  // namespace ceres