Improve threading in covariance.
Covariance computation wants to do a triangular iteration but as a
single loop. Right now it iterates over a square and does nothing half
the time, which is inefficient and has bad worst-case threading
performance. This adds a utility that allows waste-free linear iteration
over a triangle.
Change-Id: I881d5683c65882f87dc2b5f8449a855d22ace755
diff --git a/BUILD b/BUILD
index 25949c1..4018a68 100644
--- a/BUILD
+++ b/BUILD
@@ -123,6 +123,7 @@
"numeric_diff_cost_function",
"ordered_groups",
"parallel_for",
+ "parallel_utils",
"parameter_block_ordering",
"parameter_block",
"partitioned_matrix_view",
diff --git a/bazel/ceres.bzl b/bazel/ceres.bzl
index 090707d..d90e5a3 100644
--- a/bazel/ceres.bzl
+++ b/bazel/ceres.bzl
@@ -91,6 +91,7 @@
"normal_prior.cc",
"parallel_for_cxx.cc",
"parallel_for_tbb.cc",
+ "parallel_utils.cc",
"parameter_block_ordering.cc",
"partitioned_matrix_view.cc",
"polynomial.cc",
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index bddbe79..55cd0eb 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -92,6 +92,7 @@
normal_prior.cc
parallel_for_cxx.cc
parallel_for_tbb.cc
+ parallel_utils.cc
parameter_block_ordering.cc
partitioned_matrix_view.cc
polynomial.cc
@@ -348,6 +349,7 @@
ceres_test(numeric_diff_cost_function)
ceres_test(ordered_groups)
ceres_test(parallel_for)
+ ceres_test(parallel_utils)
ceres_test(parameter_block)
ceres_test(parameter_block_ordering)
ceres_test(partitioned_matrix_view)
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index 8e13524..c52866b 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -52,6 +52,7 @@
#include "ceres/crs_matrix.h"
#include "ceres/internal/eigen.h"
#include "ceres/map_util.h"
+#include "ceres/parallel_utils.h"
#include "ceres/parameter_block.h"
#include "ceres/problem_impl.h"
#include "ceres/residual_block.h"
@@ -348,32 +349,22 @@
// Technically the following code is a double nested loop where
// i = 1:n, j = i:n.
- //
- // To make things easier to parallelize, we instead iterate over k =
- // 1:n^2, compute i * j ,and skip over the lower triangular part of
- // the loop.
+ int iteration_count = (num_parameters * (num_parameters + 1)) / 2;
#if defined(CERES_USE_OPENMP)
# pragma omp parallel for num_threads(num_threads) schedule(dynamic)
#endif // CERES_USE_OPENMP
#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- for (int k = 0; k < num_parameters * num_parameters; ++k) {
+ for (int k = 0; k < iteration_count; ++k) {
#else
problem_->context()->EnsureMinimumThreads(num_threads);
ParallelFor(problem_->context(),
0,
- num_parameters * num_parameters,
+ iteration_count,
num_threads,
[&](int thread_id, int k) {
#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- int i = k / num_parameters;
- int j = k % num_parameters;
- if (j < i) {
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- return;
-#else
- continue;
-#endif
- }
+ int i, j;
+ LinearIndexToUpperTriangularIndex(k, num_parameters, &i, &j);
int covariance_row_idx = cum_parameter_size[i];
int covariance_col_idx = cum_parameter_size[j];
diff --git a/internal/ceres/parallel_utils.cc b/internal/ceres/parallel_utils.cc
new file mode 100644
index 0000000..e1cb5f9
--- /dev/null
+++ b/internal/ceres/parallel_utils.cc
@@ -0,0 +1,90 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wjr@google.com (William Rucklidge)
+
+#include "ceres/parallel_utils.h"
+
+namespace ceres {
+namespace internal {
+
+void LinearIndexToUpperTriangularIndex(int k, int n, int* i, int* j) {
+ // This works by unfolding a rectangle into a triangle.
+ // Say n is even. 4 is a nice even number. The 10 i,j pairs that we
+ // want to produce are:
+ // 0,0 0,1 0,2 0,3
+ // 1,1 1,2 1,3
+ // 2,2 2,3
+ // 3,3
+ // This triangle can be folded into a 5x2 rectangle:
+ // 3,3 0,0 0,1 0,2 0,3
+ // 2,2 2,3 1,1 1,2 1,3
+
+ // If N is odd, say 5, then the 15 i,j pairs are:
+ // 0,0 0,1 0,2 0,3 0,4
+ // 1,1 1,2 1,3 1,4
+ // 2,2 2,3 2,3
+ // 3,3 3,4
+ // 4,4
+ // which folds to a 5x3 rectangle:
+ // 0,0 0,1 0,2 0,3 0,4
+ // 4,4 1,1 1,2 1,3 1,4
+ // 3,3 3,4 2,2 2,3 2,4
+
+ // All this function does is map the linear iteration position to a
+ // location in the rectangle and work out the appropriate (i, j) for that
+ // location.
+ if (n & 1) {
+ // Odd n. The tip of the triangle is on row 1.
+ int w = n; // Width of the rectangle to unfold
+ int i0 = k / w;
+ int j0 = k % w;
+ if (j0 >= i0) {
+ *i = i0;
+ *j = j0;
+ } else {
+ *i = n - i0;
+ *j = *i + j0;
+ }
+ } else {
+ // Even n. The tip of the triangle is on row 0, making it one wider.
+ int w = n + 1;
+ int i0 = k / w;
+ int j0 = k % w;
+ if (j0 > i0) {
+ *i = i0;
+ *j = j0 - 1;
+ } else {
+ *i = n - 1 - i0;
+ *j = *i + j0;
+ }
+ }
+}
+
+} // namespace internal
+} // namespace ceres
diff --git a/internal/ceres/parallel_utils.h b/internal/ceres/parallel_utils.h
new file mode 100644
index 0000000..1291428
--- /dev/null
+++ b/internal/ceres/parallel_utils.h
@@ -0,0 +1,67 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wjr@google.com (William Rucklidge)
+
+#ifndef CERES_INTERNAL_PARALLEL_UTILS_H_
+#define CERES_INTERNAL_PARALLEL_UTILS_H_
+
+namespace ceres {
+namespace internal {
+
+// Converts a linear iteration order into a triangular iteration order.
+// Suppose you have nested loops that look like
+// for (int i = 0; i < n; i++) {
+// for (int j = i; j < n; j++) {
+// ... use i and j
+// }
+// }
+// Naively using ParallelFor to parallelise those loops might look like
+// ParallelFor(..., 0, n * n, num_threads,
+// [](int thread_id, int k) {
+// int i = k / n, j = k % n;
+// if (j < i) return;
+// ...
+// });
+// but these empty work items can lead to very unbalanced threading. Instead,
+// do this:
+// int actual_work_items = (n * (n + 1)) / 2;
+// ParallelFor(..., 0, actual_work_items, num_threads,
+// [](int thread_id, int k) {
+// int i, j;
+// UnfoldIteration(k, n, &i, &j);
+// ...
+// });
+// which in each iteration will produce i and j satisfying
+// 0 <= i <= j < n
+void LinearIndexToUpperTriangularIndex(int k, int n, int* i, int* j);
+
+} // namespace internal
+} // namespace ceres
+
+#endif // CERES_INTERNAL_PARALLEL_UTILS_H_
diff --git a/internal/ceres/parallel_utils_test.cc b/internal/ceres/parallel_utils_test.cc
new file mode 100644
index 0000000..f997d25
--- /dev/null
+++ b/internal/ceres/parallel_utils_test.cc
@@ -0,0 +1,61 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wjr@google.com (William Rucklidge)
+
+// This include must come before any #ifndef check on Ceres compile options.
+#include "ceres/internal/port.h"
+#include "ceres/parallel_utils.h"
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace ceres {
+namespace internal {
+
+// Tests that unfolding linear iterations to triangular iterations produces
+// indices that are in-range and unique.
+TEST(LinearIndexToUpperTriangularIndexTest, UniqueAndValid) {
+ for (int n = 0; n < 100; n++) {
+ std::set<std::pair<int, int>> seen_pairs;
+ int actual_work_items = (n * (n + 1)) / 2;
+ for (int k = 0; k < actual_work_items; k++) {
+ int i, j;
+ LinearIndexToUpperTriangularIndex(k, n, &i, &j);
+ EXPECT_GE(i, 0);
+ EXPECT_LT(i, n);
+ EXPECT_GE(j, i);
+ EXPECT_LT(j, n);
+ seen_pairs.insert(std::make_pair(i, j));
+ }
+ EXPECT_EQ(actual_work_items, seen_pairs.size());
+ }
+}
+
+} // namespace internal
+} // namespace ceres