Adds a ParallelFor wrapper for no threads and OpenMP. With the addition of C++11 support we can simplify the parallel for code by removing the ifdef branching. Converts coordinate_descent_minimizer.cc to use the thread_id ParallelFor API. Tested by building with OpenMP, C++11 threads, TBB, and no threads. All tests pass. Also compared timing via the bundle adjuster. ./bin/bundle_adjuster --input=../problem-744-543562-pre.txt With OpenMP num_threads=8 Head: Time (in seconds): Residual only evaluation 0.807753 (5) Jacobian & residual evaluation 4.489404 (6) Linear solver 41.826481 (5) Minimizer 50.745857 Total 73.294424 CL: Time (in seconds): Residual only evaluation 0.970483 (5) Jacobian & residual evaluation 4.647438 (6) Linear solver 41.781892 (5) Minimizer 50.848904 Total 73.089983 With OpenMP num_threads=1 HEAD: Time (in seconds): Residual only evaluation 2.990246 (5) Jacobian & residual evaluation 14.132090 (6) Linear solver 79.631951 (5) Minimizer 100.281847 Total 122.946267 CL: Time (in seconds): Residual only evaluation 3.075178 (5) Jacobian & residual evaluation 13.966451 (6) Linear solver 77.005441 (5) Minimizer 97.568712 Total 120.410454 Change-Id: I1857d7943073be7465b6c6476bf46ab11c5475a3
diff --git a/bazel/ceres.bzl b/bazel/ceres.bzl index 6ba0137..8963370 100644 --- a/bazel/ceres.bzl +++ b/bazel/ceres.bzl
@@ -91,6 +91,7 @@ "minimizer.cc", "normal_prior.cc", "parallel_for_cxx.cc", + "parallel_for_openmp.cc", "parallel_for_tbb.cc", "parallel_utils.cc", "parameter_block_ordering.cc",
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index 8924173..1dad396 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -91,6 +91,7 @@ low_rank_inverse_hessian.cc minimizer.cc normal_prior.cc + parallel_for_openmp.cc parallel_for_cxx.cc parallel_for_tbb.cc parallel_utils.cc
diff --git a/internal/ceres/coordinate_descent_minimizer.cc b/internal/ceres/coordinate_descent_minimizer.cc index 6f20f20..558faed 100644 --- a/internal/ceres/coordinate_descent_minimizer.cc +++ b/internal/ceres/coordinate_descent_minimizer.cc
@@ -30,10 +30,6 @@ #include "ceres/coordinate_descent_minimizer.h" -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) -#include "ceres/parallel_for.h" -#endif - #include <algorithm> #include <iterator> #include <memory> @@ -43,14 +39,13 @@ #include "ceres/evaluator.h" #include "ceres/linear_solver.h" #include "ceres/minimizer.h" +#include "ceres/parallel_for.h" #include "ceres/parameter_block.h" #include "ceres/parameter_block_ordering.h" #include "ceres/problem_impl.h" #include "ceres/program.h" #include "ceres/residual_block.h" -#include "ceres/scoped_thread_token.h" #include "ceres/solver.h" -#include "ceres/thread_token_provider.h" #include "ceres/trust_region_minimizer.h" #include "ceres/trust_region_strategy.h" @@ -164,62 +159,45 @@ evaluator_options_.num_threads = max(1, options.num_threads / num_inner_iteration_threads); - ThreadTokenProvider thread_token_provider(num_inner_iteration_threads); - -#ifdef CERES_USE_OPENMP // The parameter blocks in each independent set can be optimized // in parallel, since they do not co-occur in any residual block. -#pragma omp parallel for num_threads(num_inner_iteration_threads) -#endif + ParallelFor( + context_, + independent_set_offsets_[i], + independent_set_offsets_[i + 1], + num_inner_iteration_threads, + [&](int thread_id, int j) { + ParameterBlock* parameter_block = parameter_blocks_[j]; + const int old_index = parameter_block->index(); + const int old_delta_offset = parameter_block->delta_offset(); + parameter_block->SetVarying(); + parameter_block->set_index(0); + parameter_block->set_delta_offset(0); -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - for (int j = independent_set_offsets_[i]; - j < independent_set_offsets_[i + 1]; - ++j) { -#else - ParallelFor(context_, - independent_set_offsets_[i], - independent_set_offsets_[i + 1], - num_inner_iteration_threads, - [&](int j) { -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) + Program inner_program; + inner_program.mutable_parameter_blocks()->push_back(parameter_block); + *inner_program.mutable_residual_blocks() = residual_blocks_[j]; - const ScopedThreadToken scoped_thread_token(&thread_token_provider); - const int thread_id = scoped_thread_token.token(); + // TODO(sameeragarwal): Better error handling. Right now we + // assume that this is not going to lead to problems of any + // sort. Basically we should be checking for numerical failure + // of some sort. + // + // On the other hand, if the optimization is a failure, that in + // some ways is fine, since it won't change the parameters and + // we are fine. + Solver::Summary inner_summary; + Solve(&inner_program, + linear_solvers[thread_id], + parameters + parameter_block->state_offset(), + &inner_summary); - ParameterBlock* parameter_block = parameter_blocks_[j]; - const int old_index = parameter_block->index(); - const int old_delta_offset = parameter_block->delta_offset(); - parameter_block->SetVarying(); - parameter_block->set_index(0); - parameter_block->set_delta_offset(0); - - Program inner_program; - inner_program.mutable_parameter_blocks()->push_back(parameter_block); - *inner_program.mutable_residual_blocks() = residual_blocks_[j]; - - // TODO(sameeragarwal): Better error handling. Right now we - // assume that this is not going to lead to problems of any - // sort. Basically we should be checking for numerical failure - // of some sort. - // - // On the other hand, if the optimization is a failure, that in - // some ways is fine, since it won't change the parameters and - // we are fine. - Solver::Summary inner_summary; - Solve(&inner_program, - linear_solvers[thread_id], - parameters + parameter_block->state_offset(), - &inner_summary); - - parameter_block->set_index(old_index); - parameter_block->set_delta_offset(old_delta_offset); - parameter_block->SetState(parameters + parameter_block->state_offset()); - parameter_block->SetConstant(); - } -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - ); -#endif + parameter_block->set_index(old_index); + parameter_block->set_delta_offset(old_delta_offset); + parameter_block->SetState(parameters + + parameter_block->state_offset()); + parameter_block->SetConstant(); + }); } for (int i = 0; i < parameter_blocks_.size(); ++i) {
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc index 2552416..7c63c2c 100644 --- a/internal/ceres/covariance_impl.cc +++ b/internal/ceres/covariance_impl.cc
@@ -30,10 +30,6 @@ #include "ceres/covariance_impl.h" -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) -#include "ceres/parallel_for.h" -#endif - #include <algorithm> #include <cstdlib> #include <memory> @@ -53,13 +49,12 @@ #include "ceres/crs_matrix.h" #include "ceres/internal/eigen.h" #include "ceres/map_util.h" +#include "ceres/parallel_for.h" #include "ceres/parallel_utils.h" #include "ceres/parameter_block.h" #include "ceres/problem_impl.h" #include "ceres/residual_block.h" -#include "ceres/scoped_thread_token.h" #include "ceres/suitesparse.h" -#include "ceres/thread_token_provider.h" #include "ceres/wall_time.h" #include "glog/logging.h" @@ -344,60 +339,41 @@ bool success = true; -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - ThreadTokenProvider thread_token_provider(num_threads); -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - // Technically the following code is a double nested loop where // i = 1:n, j = i:n. int iteration_count = (num_parameters * (num_parameters + 1)) / 2; -#if defined(CERES_USE_OPENMP) -# pragma omp parallel for num_threads(num_threads) schedule(dynamic) -#endif // CERES_USE_OPENMP -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - for (int k = 0; k < iteration_count; ++k) { -#else problem_->context()->EnsureMinimumThreads(num_threads); - ParallelFor(problem_->context(), - 0, - iteration_count, - num_threads, - [&](int thread_id, int k) { -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - int i, j; - LinearIndexToUpperTriangularIndex(k, num_parameters, &i, &j); + ParallelFor( + problem_->context(), + 0, + iteration_count, + num_threads, + [&](int thread_id, int k) { + int i, j; + LinearIndexToUpperTriangularIndex(k, num_parameters, &i, &j); - int covariance_row_idx = cum_parameter_size[i]; - int covariance_col_idx = cum_parameter_size[j]; - int size_i = parameter_sizes[i]; - int size_j = parameter_sizes[j]; -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - const ScopedThreadToken scoped_thread_token(&thread_token_provider); - const int thread_id = scoped_thread_token.token(); -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - double* covariance_block = - workspace.get() + - thread_id * max_covariance_block_size * max_covariance_block_size; - if (!GetCovarianceBlockInTangentOrAmbientSpace( - parameters[i], parameters[j], lift_covariance_to_ambient_space, - covariance_block)) { - success = false; - } + int covariance_row_idx = cum_parameter_size[i]; + int covariance_col_idx = cum_parameter_size[j]; + int size_i = parameter_sizes[i]; + int size_j = parameter_sizes[j]; + double* covariance_block = + workspace.get() + thread_id * max_covariance_block_size * + max_covariance_block_size; + if (!GetCovarianceBlockInTangentOrAmbientSpace( + parameters[i], parameters[j], + lift_covariance_to_ambient_space, covariance_block)) { + success = false; + } - covariance.block(covariance_row_idx, covariance_col_idx, - size_i, size_j) = - MatrixRef(covariance_block, size_i, size_j); + covariance.block(covariance_row_idx, covariance_col_idx, size_i, + size_j) = MatrixRef(covariance_block, size_i, size_j); - if (i != j) { - covariance.block(covariance_col_idx, covariance_row_idx, - size_j, size_i) = - MatrixRef(covariance_block, size_i, size_j).transpose(); - - } - } -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - ); -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) + if (i != j) { + covariance.block(covariance_col_idx, covariance_row_idx, + size_j, size_i) = + MatrixRef(covariance_block, size_i, size_j).transpose(); + } + }); return success; } @@ -709,50 +685,27 @@ const int num_threads = options_.num_threads; std::unique_ptr<double[]> workspace(new double[num_threads * num_cols]); -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - ThreadTokenProvider thread_token_provider(num_threads); -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - -#ifdef CERES_USE_OPENMP -#pragma omp parallel for num_threads(num_threads) schedule(dynamic) -#endif // CERES_USE_OPENMP - -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - for (int r = 0; r < num_cols; ++r) { -#else problem_->context()->EnsureMinimumThreads(num_threads); - ParallelFor(problem_->context(), - 0, - num_cols, - num_threads, - [&](int thread_id, int r) { -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - - const int row_begin = rows[r]; - const int row_end = rows[r + 1]; - if (row_end != row_begin) { -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - const ScopedThreadToken scoped_thread_token(&thread_token_provider); - const int thread_id = scoped_thread_token.token(); -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - - double* solution = workspace.get() + thread_id * num_cols; - SolveRTRWithSparseRHS<SuiteSparse_long>( - num_cols, - static_cast<SuiteSparse_long*>(R->i), - static_cast<SuiteSparse_long*>(R->p), - static_cast<double*>(R->x), - inverse_permutation[r], - solution); - for (int idx = row_begin; idx < row_end; ++idx) { - const int c = cols[idx]; - values[idx] = solution[inverse_permutation[c]]; - } - } - } -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - ); -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) + ParallelFor( + problem_->context(), + 0, + num_cols, + num_threads, + [&](int thread_id, int r) { + const int row_begin = rows[r]; + const int row_end = rows[r + 1]; + if (row_end != row_begin) { + double* solution = workspace.get() + thread_id * num_cols; + SolveRTRWithSparseRHS<SuiteSparse_long>( + num_cols, static_cast<SuiteSparse_long*>(R->i), + static_cast<SuiteSparse_long*>(R->p), static_cast<double*>(R->x), + inverse_permutation[r], solution); + for (int idx = row_begin; idx < row_end; ++idx) { + const int c = cols[idx]; + values[idx] = solution[inverse_permutation[c]]; + } + } + }); free(permutation); cholmod_l_free_sparse(&R, &cc); @@ -918,54 +871,33 @@ const int num_threads = options_.num_threads; std::unique_ptr<double[]> workspace(new double[num_threads * num_cols]); -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - ThreadTokenProvider thread_token_provider(num_threads); -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - -#ifdef CERES_USE_OPENMP -#pragma omp parallel for num_threads(num_threads) schedule(dynamic) -#endif // CERES_USE_OPENMP - -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - for (int r = 0; r < num_cols; ++r) { -#else problem_->context()->EnsureMinimumThreads(num_threads); - ParallelFor(problem_->context(), - 0, + ParallelFor( + problem_->context(), + 0, + num_cols, + num_threads, + [&](int thread_id, int r) { + const int row_begin = rows[r]; + const int row_end = rows[r + 1]; + if (row_end != row_begin) { + double* solution = workspace.get() + thread_id * num_cols; + SolveRTRWithSparseRHS<int>( num_cols, - num_threads, - [&](int thread_id, int r) { -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) + qr_solver.matrixR().innerIndexPtr(), + qr_solver.matrixR().outerIndexPtr(), + &qr_solver.matrixR().data().value(0), + inverse_permutation.indices().coeff(r), + solution); - const int row_begin = rows[r]; - const int row_end = rows[r + 1]; - if (row_end != row_begin) { -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - const ScopedThreadToken scoped_thread_token(&thread_token_provider); - const int thread_id = scoped_thread_token.token(); -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - - double* solution = workspace.get() + thread_id * num_cols; - SolveRTRWithSparseRHS<int>( - num_cols, - qr_solver.matrixR().innerIndexPtr(), - qr_solver.matrixR().outerIndexPtr(), - &qr_solver.matrixR().data().value(0), - inverse_permutation.indices().coeff(r), - solution); - - // Assign the values of the computed covariance using the - // inverse permutation used in the QR factorization. - for (int idx = row_begin; idx < row_end; ++idx) { - const int c = cols[idx]; - values[idx] = solution[inverse_permutation.indices().coeff(c)]; - } - } - } - -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - ); -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) + // Assign the values of the computed covariance using the + // inverse permutation used in the QR factorization. + for (int idx = row_begin; idx < row_end; ++idx) { + const int c = cols[idx]; + values[idx] = solution[inverse_permutation.indices().coeff(c)]; + } + } + }); event_logger.AddEvent("Inverse");
diff --git a/internal/ceres/parallel_for_openmp.cc b/internal/ceres/parallel_for_openmp.cc new file mode 100644 index 0000000..b64f3d7 --- /dev/null +++ b/internal/ceres/parallel_for_openmp.cc
@@ -0,0 +1,83 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2018 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: vitus@google.com (Michael Vitus) + +// This include must come before any #ifndef check on Ceres compile options. +#include "ceres/internal/port.h" + +#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) + +#include "ceres/parallel_for.h" + +#include "ceres/scoped_thread_token.h" +#include "ceres/thread_token_provider.h" +#include "glog/logging.h" + +namespace ceres { +namespace internal { + +void ParallelFor(ContextImpl* context, + int start, + int end, + int num_threads, + const std::function<void(int)>& function) { + CHECK_GT(num_threads, 0); + CHECK(context != NULL); + if (end <= start) { + return; + } + +#ifdef CERES_USE_OPENMP +#pragma omp parallel for num_threads(num_threads) \ + schedule(dynamic) if (num_threads > 1) +#endif // CERES_USE_OPENMP + for (int i = start; i < end; ++i) { + function(i); + } +} + +void ParallelFor(ContextImpl* context, + int start, + int end, + int num_threads, + const std::function<void(int thread_id, int i)>& function) { + CHECK(context != NULL); + + ThreadTokenProvider thread_token_provider(num_threads); + ParallelFor(context, start, end, num_threads, [&](int i) { + const ScopedThreadToken scoped_thread_token(&thread_token_provider); + const int thread_id = scoped_thread_token.token(); + function(thread_id, i); + }); +} + +} // namespace internal +} // namespace ceres + +#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
diff --git a/internal/ceres/parallel_for_test.cc b/internal/ceres/parallel_for_test.cc index 2de4865..04e5783 100644 --- a/internal/ceres/parallel_for_test.cc +++ b/internal/ceres/parallel_for_test.cc
@@ -31,8 +31,6 @@ // This include must come before any #ifndef check on Ceres compile options. #include "ceres/internal/port.h" -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - #include "ceres/parallel_for.h" #include <cmath> @@ -129,6 +127,8 @@ } } +// This test is only valid when multithreading support is enabled. +#ifndef CERES_NO_THREADS TEST(ParallelForWithThreadId, UniqueThreadIds) { // Ensure the hardware supports more than 1 thread to ensure the test will // pass. @@ -157,8 +157,7 @@ EXPECT_THAT(x, UnorderedElementsAreArray({0,1})); } +#endif // CERES_NO_THREADS } // namespace internal } // namespace ceres - -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h index 17175a6..13f2e83 100644 --- a/internal/ceres/program_evaluator.h +++ b/internal/ceres/program_evaluator.h
@@ -82,25 +82,20 @@ // This include must come before any #ifndef check on Ceres compile options. #include "ceres/internal/port.h" +#include <atomic> #include <map> #include <memory> #include <string> #include <vector> + #include "ceres/evaluation_callback.h" #include "ceres/execution_summary.h" #include "ceres/internal/eigen.h" +#include "ceres/parallel_for.h" #include "ceres/parameter_block.h" #include "ceres/program.h" #include "ceres/residual_block.h" -#include "ceres/scoped_thread_token.h" #include "ceres/small_blas.h" -#include "ceres/thread_token_provider.h" - -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) -#include <atomic> - -#include "ceres/parallel_for.h" -#endif namespace ceres { namespace internal { @@ -183,128 +178,84 @@ } const int num_residual_blocks = program_->NumResidualBlocks(); - -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - ThreadTokenProvider thread_token_provider(options_.num_threads); -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - -#ifdef CERES_USE_OPENMP - // This bool is used to disable the loop if an error is encountered - // without breaking out of it. The remaining loop iterations are still run, - // but with an empty body, and so will finish quickly. - bool abort = false; -#pragma omp parallel for num_threads(options_.num_threads) - for (int i = 0; i < num_residual_blocks; ++i) { -// Disable the loop instead of breaking, as required by OpenMP. -#pragma omp flush(abort) -#endif // CERES_USE_OPENMP - -#ifdef CERES_NO_THREADS - bool abort = false; - for (int i = 0; i < num_residual_blocks; ++i) { -#endif // CERES_NO_THREADS - -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) + // This bool is used to disable the loop if an error is encountered without + // breaking out of it. The remaining loop iterations are still run, but with + // an empty body, and so will finish quickly. std::atomic_bool abort(false); - - ParallelFor(options_.context, - 0, - num_residual_blocks, - options_.num_threads, - [&](int thread_id, int i) { -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - - if (abort) { -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - return; -#else - continue; -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - } - -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - const ScopedThreadToken scoped_thread_token(&thread_token_provider); - const int thread_id = scoped_thread_token.token(); -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - - EvaluatePreparer* preparer = &evaluate_preparers_[thread_id]; - EvaluateScratch* scratch = &evaluate_scratch_[thread_id]; - - // Prepare block residuals if requested. - const ResidualBlock* residual_block = program_->residual_blocks()[i]; - double* block_residuals = NULL; - if (residuals != NULL) { - block_residuals = residuals + residual_layout_[i]; - } else if (gradient != NULL) { - block_residuals = scratch->residual_block_residuals.get(); - } - - // Prepare block jacobians if requested. - double** block_jacobians = NULL; - if (jacobian != NULL || gradient != NULL) { - preparer->Prepare(residual_block, - i, - jacobian, - scratch->jacobian_block_ptrs.get()); - block_jacobians = scratch->jacobian_block_ptrs.get(); - } - - // Evaluate the cost, residuals, and jacobians. - double block_cost; - if (!residual_block->Evaluate( - evaluate_options.apply_loss_function, - &block_cost, - block_residuals, - block_jacobians, - scratch->residual_block_evaluate_scratch.get())) { - abort = true; -#ifdef CERES_USE_OPENMP -// This ensures that the OpenMP threads have a consistent view of 'abort'. Do -// the flush inside the failure case so that there is usually only one -// synchronization point per loop iteration instead of two. -#pragma omp flush(abort) -#endif // CERES_USE_OPENMP - -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - return; -#else - continue; -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - } - - scratch->cost += block_cost; - - // Store the jacobians, if they were requested. - if (jacobian != NULL) { - jacobian_writer_.Write(i, - residual_layout_[i], - block_jacobians, - jacobian); - } - - // Compute and store the gradient, if it was requested. - if (gradient != NULL) { - int num_residuals = residual_block->NumResiduals(); - int num_parameter_blocks = residual_block->NumParameterBlocks(); - for (int j = 0; j < num_parameter_blocks; ++j) { - const ParameterBlock* parameter_block = - residual_block->parameter_blocks()[j]; - if (parameter_block->IsConstant()) { - continue; + ParallelFor( + options_.context, + 0, + num_residual_blocks, + options_.num_threads, + [&](int thread_id, int i) { + if (abort) { + return; } - MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>( - block_jacobians[j], - num_residuals, - parameter_block->LocalSize(), - block_residuals, - scratch->gradient.get() + parameter_block->delta_offset()); - } - } - } -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - ); -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) + EvaluatePreparer* preparer = &evaluate_preparers_[thread_id]; + EvaluateScratch* scratch = &evaluate_scratch_[thread_id]; + + // Prepare block residuals if requested. + const ResidualBlock* residual_block = program_->residual_blocks()[i]; + double* block_residuals = NULL; + if (residuals != NULL) { + block_residuals = residuals + residual_layout_[i]; + } else if (gradient != NULL) { + block_residuals = scratch->residual_block_residuals.get(); + } + + // Prepare block jacobians if requested. + double** block_jacobians = NULL; + if (jacobian != NULL || gradient != NULL) { + preparer->Prepare(residual_block, + i, + jacobian, + scratch->jacobian_block_ptrs.get()); + block_jacobians = scratch->jacobian_block_ptrs.get(); + } + + // Evaluate the cost, residuals, and jacobians. + double block_cost; + if (!residual_block->Evaluate( + evaluate_options.apply_loss_function, + &block_cost, + block_residuals, + block_jacobians, + scratch->residual_block_evaluate_scratch.get())) { + abort = true; + return; + } + + scratch->cost += block_cost; + + // Store the jacobians, if they were requested. + if (jacobian != NULL) { + jacobian_writer_.Write(i, + residual_layout_[i], + block_jacobians, + jacobian); + } + + // Compute and store the gradient, if it was requested. + if (gradient != NULL) { + int num_residuals = residual_block->NumResiduals(); + int num_parameter_blocks = residual_block->NumParameterBlocks(); + for (int j = 0; j < num_parameter_blocks; ++j) { + const ParameterBlock* parameter_block = + residual_block->parameter_blocks()[j]; + if (parameter_block->IsConstant()) { + continue; + } + + MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>( + block_jacobians[j], + num_residuals, + parameter_block->LocalSize(), + block_residuals, + scratch->gradient.get() + parameter_block->delta_offset()); + } + } + }); if (!abort) { const int num_parameters = program_->NumEffectiveParameters();
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h index da5d922..203dcc9 100644 --- a/internal/ceres/schur_eliminator_impl.h +++ b/internal/ceres/schur_eliminator_impl.h
@@ -51,6 +51,7 @@ #include <algorithm> #include <map> +#include "Eigen/Dense" #include "ceres/block_random_access_matrix.h" #include "ceres/block_sparse_matrix.h" #include "ceres/block_structure.h" @@ -58,18 +59,14 @@ #include "ceres/internal/fixed_array.h" #include "ceres/invert_psd_matrix.h" #include "ceres/map_util.h" +#include "ceres/parallel_for.h" #include "ceres/schur_eliminator.h" #include "ceres/scoped_thread_token.h" #include "ceres/small_blas.h" #include "ceres/stl_util.h" #include "ceres/thread_token_provider.h" -#include "Eigen/Dense" #include "glog/logging.h" -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) -#include "ceres/parallel_for.h" -#endif - namespace ceres { namespace internal { @@ -190,43 +187,29 @@ // Add the diagonal to the schur complement. if (D != NULL) { -#ifdef CERES_USE_OPENMP -#pragma omp parallel for num_threads(num_threads_) schedule(dynamic) -#endif // CERES_USE_OPENMP + ParallelFor( + context_, + num_eliminate_blocks_, + num_col_blocks, + num_threads_, + [&](int i) { + const int block_id = i - num_eliminate_blocks_; + int r, c, row_stride, col_stride; + CellInfo* cell_info = lhs->GetCell(block_id, block_id, &r, &c, + &row_stride, &col_stride); + if (cell_info != NULL) { + const int block_size = bs->cols[i].size; + typename EigenTypes<Eigen::Dynamic>::ConstVectorRef diag( + D + bs->cols[i].position, block_size); -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) { -#else - ParallelFor(context_, num_eliminate_blocks_, num_col_blocks, num_threads_, - [&](int i) { -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - - const int block_id = i - num_eliminate_blocks_; - int r, c, row_stride, col_stride; - CellInfo* cell_info = lhs->GetCell(block_id, block_id, - &r, &c, - &row_stride, &col_stride); - if (cell_info != NULL) { - const int block_size = bs->cols[i].size; - typename EigenTypes<Eigen::Dynamic>::ConstVectorRef - diag(D + bs->cols[i].position, block_size); - - std::lock_guard<std::mutex> l(cell_info->m); - MatrixRef m(cell_info->values, row_stride, col_stride); - m.block(r, c, block_size, block_size).diagonal() - += diag.array().square().matrix(); - } - } -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - ); -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) + std::lock_guard<std::mutex> l(cell_info->m); + MatrixRef m(cell_info->values, row_stride, col_stride); + m.block(r, c, block_size, block_size).diagonal() += + diag.array().square().matrix(); + } + }); } -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - ThreadTokenProvider thread_token_provider(num_threads_); -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - -#ifdef CERES_USE_OPENMP // Eliminate y blocks one chunk at a time. For each chunk, compute // the entries of the normal equations and the gradient vector block // corresponding to the y block and then apply Gaussian elimination @@ -240,88 +223,76 @@ // z blocks that share a row block/residual term with the y // block. EliminateRowOuterProduct does the corresponding operation // for the lhs of the reduced linear system. -#pragma omp parallel for num_threads(num_threads_) schedule(dynamic) -#endif // CERES_USE_OPENMP + ParallelFor( + context_, + 0, + int(chunks_.size()), + num_threads_, + [&](int thread_id, int i) { + double* buffer = buffer_.get() + thread_id * buffer_size_; + const Chunk& chunk = chunks_[i]; + const int e_block_id = bs->rows[chunk.start].cells.front().block_id; + const int e_block_size = bs->cols[e_block_id].size; -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - for (int i = 0; i < chunks_.size(); ++i) { - const ScopedThreadToken scoped_thread_token(&thread_token_provider); - const int thread_id = scoped_thread_token.token(); -#else - ParallelFor(context_, - 0, - int(chunks_.size()), - num_threads_, - [&](int thread_id, int i) { -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) + VectorRef(buffer, buffer_size_).setZero(); - double* buffer = buffer_.get() + thread_id * buffer_size_; - const Chunk& chunk = chunks_[i]; - const int e_block_id = bs->rows[chunk.start].cells.front().block_id; - const int e_block_size = bs->cols[e_block_id].size; + typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix + ete(e_block_size, e_block_size); - VectorRef(buffer, buffer_size_).setZero(); + if (D != NULL) { + const typename EigenTypes<kEBlockSize>::ConstVectorRef + diag(D + bs->cols[e_block_id].position, e_block_size); + ete = diag.array().square().matrix().asDiagonal(); + } else { + ete.setZero(); + } - typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix - ete(e_block_size, e_block_size); + FixedArray<double, 8> g(e_block_size); + typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size); + gref.setZero(); - if (D != NULL) { - const typename EigenTypes<kEBlockSize>::ConstVectorRef - diag(D + bs->cols[e_block_id].position, e_block_size); - ete = diag.array().square().matrix().asDiagonal(); - } else { - ete.setZero(); - } + // We are going to be computing + // + // S += F'F - F'E(E'E)^{-1}E'F + // + // for each Chunk. The computation is broken down into a number of + // function calls as below. - FixedArray<double, 8> g(e_block_size); - typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size); - gref.setZero(); + // Compute the outer product of the e_blocks with themselves (ete + // = E'E). Compute the product of the e_blocks with the + // corresonding f_blocks (buffer = E'F), the gradient of the terms + // in this chunk (g) and add the outer product of the f_blocks to + // Schur complement (S += F'F). + ChunkDiagonalBlockAndGradient( + chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs); - // We are going to be computing - // - // S += F'F - F'E(E'E)^{-1}E'F - // - // for each Chunk. The computation is broken down into a number of - // function calls as below. + // Normally one wouldn't compute the inverse explicitly, but + // e_block_size will typically be a small number like 3, in + // which case its much faster to compute the inverse once and + // use it to multiply other matrices/vectors instead of doing a + // Solve call over and over again. + typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix inverse_ete = + InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete); - // Compute the outer product of the e_blocks with themselves (ete - // = E'E). Compute the product of the e_blocks with the - // corresonding f_blocks (buffer = E'F), the gradient of the terms - // in this chunk (g) and add the outer product of the f_blocks to - // Schur complement (S += F'F). - ChunkDiagonalBlockAndGradient( - chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs); + // For the current chunk compute and update the rhs of the reduced + // linear system. + // + // rhs = F'b - F'E(E'E)^(-1) E'b - // Normally one wouldn't compute the inverse explicitly, but - // e_block_size will typically be a small number like 3, in - // which case its much faster to compute the inverse once and - // use it to multiply other matrices/vectors instead of doing a - // Solve call over and over again. - typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix inverse_ete = - InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete); + FixedArray<double, 8> inverse_ete_g(e_block_size); + MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>( + inverse_ete.data(), + e_block_size, + e_block_size, + g.get(), + inverse_ete_g.get()); - // For the current chunk compute and update the rhs of the reduced - // linear system. - // - // rhs = F'b - F'E(E'E)^(-1) E'b + UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs); - FixedArray<double, 8> inverse_ete_g(e_block_size); - MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>( - inverse_ete.data(), - e_block_size, - e_block_size, - g.get(), - inverse_ete_g.get()); - - UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs); - - // S -= F'E(E'E)^{-1}E'F - ChunkOuterProduct( + // S -= F'E(E'E)^{-1}E'F + ChunkOuterProduct( thread_id, bs, inverse_ete, buffer, chunk.buffer_layout, lhs); - } -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - ); -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) + }); // For rows with no e_blocks, the schur complement update reduces to // S += F'F. @@ -338,21 +309,17 @@ double* y) { const CompressedRowBlockStructure* bs = A->block_structure(); -#ifdef CERES_USE_OPENMP -#pragma omp parallel for num_threads(num_threads_) schedule(dynamic) -#endif // CERES_USE_OPENMP - -#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - for (int i = 0; i < chunks_.size(); ++i) { -#else - ParallelFor(context_, 0, int(chunks_.size()), num_threads_, [&](int i) { -#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)) - + ParallelFor( + context_, + 0, + int(chunks_.size()), + num_threads_, + [&](int i) { const Chunk& chunk = chunks_[i]; const int e_block_id = bs->rows[chunk.start].cells.front().block_id; const int e_block_size = bs->cols[e_block_id].size; - double* y_ptr = y + bs->cols[e_block_id].position; + double* y_ptr = y + bs->cols[e_block_id].position; typename EigenTypes<kEBlockSize>::VectorRef y_block(y_ptr, e_block_size); typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix @@ -395,17 +362,14 @@ MatrixTransposeMatrixMultiply <kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>( - values + e_cell.position, row.block.size, e_block_size, - values + e_cell.position, row.block.size, e_block_size, - ete.data(), 0, 0, e_block_size, e_block_size); + values + e_cell.position, row.block.size, e_block_size, + values + e_cell.position, row.block.size, e_block_size, + ete.data(), 0, 0, e_block_size, e_block_size); } - y_block = InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete) - * y_block; - } -#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) - ); -#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS) + y_block = + InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete) * y_block; + }); } // Update the rhs of the reduced linear system. Compute
diff --git a/jni/Android.mk b/jni/Android.mk index 2c58147..80861dd 100644 --- a/jni/Android.mk +++ b/jni/Android.mk
@@ -178,6 +178,7 @@ $(CERES_SRC_PATH)/low_rank_inverse_hessian.cc \ $(CERES_SRC_PATH)/minimizer.cc \ $(CERES_SRC_PATH)/normal_prior.cc \ + $(CERES_SRC_PATH)/parallel_for_openmp.cc \ $(CERES_SRC_PATH)/parameter_block_ordering.cc \ $(CERES_SRC_PATH)/partitioned_matrix_view.cc \ $(CERES_SRC_PATH)/polynomial.cc \