Improve threading in covariance. Covariance computation wants to do a triangular iteration but as a single loop. Right now it iterates over a square and does nothing half the time, which is inefficient and has bad worst-case threading performance. This adds a utility that allows waste-free linear iteration over a triangle. Change-Id: I881d5683c65882f87dc2b5f8449a855d22ace755
diff --git a/BUILD b/BUILD index 25949c1..4018a68 100644 --- a/BUILD +++ b/BUILD
@@ -123,6 +123,7 @@ "numeric_diff_cost_function", "ordered_groups", "parallel_for", + "parallel_utils", "parameter_block_ordering", "parameter_block", "partitioned_matrix_view",
diff --git a/bazel/ceres.bzl b/bazel/ceres.bzl index 090707d..d90e5a3 100644 --- a/bazel/ceres.bzl +++ b/bazel/ceres.bzl
@@ -91,6 +91,7 @@ "normal_prior.cc", "parallel_for_cxx.cc", "parallel_for_tbb.cc", + "parallel_utils.cc", "parameter_block_ordering.cc", "partitioned_matrix_view.cc", "polynomial.cc",
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index bddbe79..55cd0eb 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -92,6 +92,7 @@ normal_prior.cc parallel_for_cxx.cc parallel_for_tbb.cc + parallel_utils.cc parameter_block_ordering.cc partitioned_matrix_view.cc polynomial.cc @@ -348,6 +349,7 @@ ceres_test(numeric_diff_cost_function) ceres_test(ordered_groups) ceres_test(parallel_for) + ceres_test(parallel_utils) ceres_test(parameter_block) ceres_test(parameter_block_ordering) ceres_test(partitioned_matrix_view)
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc index 8e13524..c52866b 100644 --- a/internal/ceres/covariance_impl.cc +++ b/internal/ceres/covariance_impl.cc
@@ -52,6 +52,7 @@ #include "ceres/crs_matrix.h" #include "ceres/internal/eigen.h" #include "ceres/map_util.h" +#include "ceres/parallel_utils.h" #include "ceres/parameter_block.h" #include "ceres/problem_impl.h" #include "ceres/residual_block.h" @@ -348,32 +349,22 @@ // Technically the following code is a double nested loop where // i = 1:n, j = i:n. - // - // To make things easier to parallelize, we instead iterate over k = - // 1:n^2, compute i * j ,and skip over the lower triangular part of - // the loop. + int iteration_count = (num_parameters * (num_parameters + 1)) / 2; #if defined(CERES_USE_OPENMP) # pragma omp parallel for num_threads(num_threads) schedule(dynamic) #endif // CERES_USE_OPENMP #if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - for (int k = 0; k < num_parameters * num_parameters; ++k) { + for (int k = 0; k < iteration_count; ++k) { #else problem_->context()->EnsureMinimumThreads(num_threads); ParallelFor(problem_->context(), 0, - num_parameters * num_parameters, + iteration_count, num_threads, [&](int thread_id, int k) { #endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - int i = k / num_parameters; - int j = k % num_parameters; - if (j < i) { -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - return; -#else - continue; -#endif - } + int i, j; + LinearIndexToUpperTriangularIndex(k, num_parameters, &i, &j); int covariance_row_idx = cum_parameter_size[i]; int covariance_col_idx = cum_parameter_size[j];
diff --git a/internal/ceres/parallel_utils.cc b/internal/ceres/parallel_utils.cc new file mode 100644 index 0000000..e1cb5f9 --- /dev/null +++ b/internal/ceres/parallel_utils.cc
@@ -0,0 +1,90 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2018 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: wjr@google.com (William Rucklidge) + +#include "ceres/parallel_utils.h" + +namespace ceres { +namespace internal { + +void LinearIndexToUpperTriangularIndex(int k, int n, int* i, int* j) { + // This works by unfolding a rectangle into a triangle. + // Say n is even. 4 is a nice even number. The 10 i,j pairs that we + // want to produce are: + // 0,0 0,1 0,2 0,3 + // 1,1 1,2 1,3 + // 2,2 2,3 + // 3,3 + // This triangle can be folded into a 5x2 rectangle: + // 3,3 0,0 0,1 0,2 0,3 + // 2,2 2,3 1,1 1,2 1,3 + + // If N is odd, say 5, then the 15 i,j pairs are: + // 0,0 0,1 0,2 0,3 0,4 + // 1,1 1,2 1,3 1,4 + // 2,2 2,3 2,3 + // 3,3 3,4 + // 4,4 + // which folds to a 5x3 rectangle: + // 0,0 0,1 0,2 0,3 0,4 + // 4,4 1,1 1,2 1,3 1,4 + // 3,3 3,4 2,2 2,3 2,4 + + // All this function does is map the linear iteration position to a + // location in the rectangle and work out the appropriate (i, j) for that + // location. + if (n & 1) { + // Odd n. The tip of the triangle is on row 1. + int w = n; // Width of the rectangle to unfold + int i0 = k / w; + int j0 = k % w; + if (j0 >= i0) { + *i = i0; + *j = j0; + } else { + *i = n - i0; + *j = *i + j0; + } + } else { + // Even n. The tip of the triangle is on row 0, making it one wider. + int w = n + 1; + int i0 = k / w; + int j0 = k % w; + if (j0 > i0) { + *i = i0; + *j = j0 - 1; + } else { + *i = n - 1 - i0; + *j = *i + j0; + } + } +} + +} // namespace internal +} // namespace ceres
diff --git a/internal/ceres/parallel_utils.h b/internal/ceres/parallel_utils.h new file mode 100644 index 0000000..1291428 --- /dev/null +++ b/internal/ceres/parallel_utils.h
@@ -0,0 +1,67 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2018 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: wjr@google.com (William Rucklidge) + +#ifndef CERES_INTERNAL_PARALLEL_UTILS_H_ +#define CERES_INTERNAL_PARALLEL_UTILS_H_ + +namespace ceres { +namespace internal { + +// Converts a linear iteration order into a triangular iteration order. +// Suppose you have nested loops that look like +// for (int i = 0; i < n; i++) { +// for (int j = i; j < n; j++) { +// ... use i and j +// } +// } +// Naively using ParallelFor to parallelise those loops might look like +// ParallelFor(..., 0, n * n, num_threads, +// [](int thread_id, int k) { +// int i = k / n, j = k % n; +// if (j < i) return; +// ... +// }); +// but these empty work items can lead to very unbalanced threading. Instead, +// do this: +// int actual_work_items = (n * (n + 1)) / 2; +// ParallelFor(..., 0, actual_work_items, num_threads, +// [](int thread_id, int k) { +// int i, j; +// UnfoldIteration(k, n, &i, &j); +// ... +// }); +// which in each iteration will produce i and j satisfying +// 0 <= i <= j < n +void LinearIndexToUpperTriangularIndex(int k, int n, int* i, int* j); + +} // namespace internal +} // namespace ceres + +#endif // CERES_INTERNAL_PARALLEL_UTILS_H_
diff --git a/internal/ceres/parallel_utils_test.cc b/internal/ceres/parallel_utils_test.cc new file mode 100644 index 0000000..f997d25 --- /dev/null +++ b/internal/ceres/parallel_utils_test.cc
@@ -0,0 +1,61 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2018 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: wjr@google.com (William Rucklidge) + +// This include must come before any #ifndef check on Ceres compile options. +#include "ceres/internal/port.h" +#include "ceres/parallel_utils.h" + +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace ceres { +namespace internal { + +// Tests that unfolding linear iterations to triangular iterations produces +// indices that are in-range and unique. +TEST(LinearIndexToUpperTriangularIndexTest, UniqueAndValid) { + for (int n = 0; n < 100; n++) { + std::set<std::pair<int, int>> seen_pairs; + int actual_work_items = (n * (n + 1)) / 2; + for (int k = 0; k < actual_work_items; k++) { + int i, j; + LinearIndexToUpperTriangularIndex(k, n, &i, &j); + EXPECT_GE(i, 0); + EXPECT_LT(i, n); + EXPECT_GE(j, i); + EXPECT_LT(j, n); + seen_pairs.insert(std::make_pair(i, j)); + } + EXPECT_EQ(actual_work_items, seen_pairs.size()); + } +} + +} // namespace internal +} // namespace ceres