Adds a ParallelFor wrapper for no threads and OpenMP.
With the addition of C++11 support we can simplify the parallel for code by
removing the ifdef branching. Converts coordinate_descent_minimizer.cc to use
the thread_id ParallelFor API.
Tested by building with OpenMP, C++11 threads, TBB, and no threads. All tests
pass.
Also compared timing via the bundle adjuster.
./bin/bundle_adjuster --input=../problem-744-543562-pre.txt
With OpenMP num_threads=8
HEAD:
Time (in seconds):
Residual only evaluation 0.807753 (5)
Jacobian & residual evaluation 4.489404 (6)
Linear solver 41.826481 (5)
Minimizer 50.745857
Total 73.294424
CL:
Time (in seconds):
Residual only evaluation 0.970483 (5)
Jacobian & residual evaluation 4.647438 (6)
Linear solver 41.781892 (5)
Minimizer 50.848904
Total 73.089983
With OpenMP num_threads=1
HEAD:
Time (in seconds):
Residual only evaluation 2.990246 (5)
Jacobian & residual evaluation 14.132090 (6)
Linear solver 79.631951 (5)
Minimizer 100.281847
Total 122.946267
CL:
Time (in seconds):
Residual only evaluation 3.075178 (5)
Jacobian & residual evaluation 13.966451 (6)
Linear solver 77.005441 (5)
Minimizer 97.568712
Total 120.410454
Change-Id: I1857d7943073be7465b6c6476bf46ab11c5475a3
diff --git a/bazel/ceres.bzl b/bazel/ceres.bzl
index 6ba0137..8963370 100644
--- a/bazel/ceres.bzl
+++ b/bazel/ceres.bzl
@@ -91,6 +91,7 @@
"minimizer.cc",
"normal_prior.cc",
"parallel_for_cxx.cc",
+ "parallel_for_openmp.cc",
"parallel_for_tbb.cc",
"parallel_utils.cc",
"parameter_block_ordering.cc",
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 8924173..1dad396 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -91,6 +91,7 @@
low_rank_inverse_hessian.cc
minimizer.cc
normal_prior.cc
+ parallel_for_openmp.cc
parallel_for_cxx.cc
parallel_for_tbb.cc
parallel_utils.cc
diff --git a/internal/ceres/coordinate_descent_minimizer.cc b/internal/ceres/coordinate_descent_minimizer.cc
index 6f20f20..558faed 100644
--- a/internal/ceres/coordinate_descent_minimizer.cc
+++ b/internal/ceres/coordinate_descent_minimizer.cc
@@ -30,10 +30,6 @@
#include "ceres/coordinate_descent_minimizer.h"
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-#include "ceres/parallel_for.h"
-#endif
-
#include <algorithm>
#include <iterator>
#include <memory>
@@ -43,14 +39,13 @@
#include "ceres/evaluator.h"
#include "ceres/linear_solver.h"
#include "ceres/minimizer.h"
+#include "ceres/parallel_for.h"
#include "ceres/parameter_block.h"
#include "ceres/parameter_block_ordering.h"
#include "ceres/problem_impl.h"
#include "ceres/program.h"
#include "ceres/residual_block.h"
-#include "ceres/scoped_thread_token.h"
#include "ceres/solver.h"
-#include "ceres/thread_token_provider.h"
#include "ceres/trust_region_minimizer.h"
#include "ceres/trust_region_strategy.h"
@@ -164,62 +159,45 @@
evaluator_options_.num_threads =
max(1, options.num_threads / num_inner_iteration_threads);
- ThreadTokenProvider thread_token_provider(num_inner_iteration_threads);
-
-#ifdef CERES_USE_OPENMP
// The parameter blocks in each independent set can be optimized
// in parallel, since they do not co-occur in any residual block.
-#pragma omp parallel for num_threads(num_inner_iteration_threads)
-#endif
+ ParallelFor(
+ context_,
+ independent_set_offsets_[i],
+ independent_set_offsets_[i + 1],
+ num_inner_iteration_threads,
+ [&](int thread_id, int j) {
+ ParameterBlock* parameter_block = parameter_blocks_[j];
+ const int old_index = parameter_block->index();
+ const int old_delta_offset = parameter_block->delta_offset();
+ parameter_block->SetVarying();
+ parameter_block->set_index(0);
+ parameter_block->set_delta_offset(0);
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- for (int j = independent_set_offsets_[i];
- j < independent_set_offsets_[i + 1];
- ++j) {
-#else
- ParallelFor(context_,
- independent_set_offsets_[i],
- independent_set_offsets_[i + 1],
- num_inner_iteration_threads,
- [&](int j) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+ Program inner_program;
+ inner_program.mutable_parameter_blocks()->push_back(parameter_block);
+ *inner_program.mutable_residual_blocks() = residual_blocks_[j];
- const ScopedThreadToken scoped_thread_token(&thread_token_provider);
- const int thread_id = scoped_thread_token.token();
+ // TODO(sameeragarwal): Better error handling. Right now we
+ // assume that this is not going to lead to problems of any
+ // sort. Basically we should be checking for numerical failure
+ // of some sort.
+ //
+ // On the other hand, if the optimization is a failure, that in
+ // some ways is fine, since it won't change the parameters and
+ // we are fine.
+ Solver::Summary inner_summary;
+ Solve(&inner_program,
+ linear_solvers[thread_id],
+ parameters + parameter_block->state_offset(),
+ &inner_summary);
- ParameterBlock* parameter_block = parameter_blocks_[j];
- const int old_index = parameter_block->index();
- const int old_delta_offset = parameter_block->delta_offset();
- parameter_block->SetVarying();
- parameter_block->set_index(0);
- parameter_block->set_delta_offset(0);
-
- Program inner_program;
- inner_program.mutable_parameter_blocks()->push_back(parameter_block);
- *inner_program.mutable_residual_blocks() = residual_blocks_[j];
-
- // TODO(sameeragarwal): Better error handling. Right now we
- // assume that this is not going to lead to problems of any
- // sort. Basically we should be checking for numerical failure
- // of some sort.
- //
- // On the other hand, if the optimization is a failure, that in
- // some ways is fine, since it won't change the parameters and
- // we are fine.
- Solver::Summary inner_summary;
- Solve(&inner_program,
- linear_solvers[thread_id],
- parameters + parameter_block->state_offset(),
- &inner_summary);
-
- parameter_block->set_index(old_index);
- parameter_block->set_delta_offset(old_delta_offset);
- parameter_block->SetState(parameters + parameter_block->state_offset());
- parameter_block->SetConstant();
- }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- );
-#endif
+ parameter_block->set_index(old_index);
+ parameter_block->set_delta_offset(old_delta_offset);
+ parameter_block->SetState(parameters +
+ parameter_block->state_offset());
+ parameter_block->SetConstant();
+ });
}
for (int i = 0; i < parameter_blocks_.size(); ++i) {
diff --git a/internal/ceres/covariance_impl.cc b/internal/ceres/covariance_impl.cc
index 2552416..7c63c2c 100644
--- a/internal/ceres/covariance_impl.cc
+++ b/internal/ceres/covariance_impl.cc
@@ -30,10 +30,6 @@
#include "ceres/covariance_impl.h"
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-#include "ceres/parallel_for.h"
-#endif
-
#include <algorithm>
#include <cstdlib>
#include <memory>
@@ -53,13 +49,12 @@
#include "ceres/crs_matrix.h"
#include "ceres/internal/eigen.h"
#include "ceres/map_util.h"
+#include "ceres/parallel_for.h"
#include "ceres/parallel_utils.h"
#include "ceres/parameter_block.h"
#include "ceres/problem_impl.h"
#include "ceres/residual_block.h"
-#include "ceres/scoped_thread_token.h"
#include "ceres/suitesparse.h"
-#include "ceres/thread_token_provider.h"
#include "ceres/wall_time.h"
#include "glog/logging.h"
@@ -344,60 +339,41 @@
bool success = true;
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- ThreadTokenProvider thread_token_provider(num_threads);
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
// Technically the following code is a double nested loop where
// i = 1:n, j = i:n.
int iteration_count = (num_parameters * (num_parameters + 1)) / 2;
-#if defined(CERES_USE_OPENMP)
-# pragma omp parallel for num_threads(num_threads) schedule(dynamic)
-#endif // CERES_USE_OPENMP
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- for (int k = 0; k < iteration_count; ++k) {
-#else
problem_->context()->EnsureMinimumThreads(num_threads);
- ParallelFor(problem_->context(),
- 0,
- iteration_count,
- num_threads,
- [&](int thread_id, int k) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- int i, j;
- LinearIndexToUpperTriangularIndex(k, num_parameters, &i, &j);
+ ParallelFor(
+ problem_->context(),
+ 0,
+ iteration_count,
+ num_threads,
+ [&](int thread_id, int k) {
+ int i, j;
+ LinearIndexToUpperTriangularIndex(k, num_parameters, &i, &j);
- int covariance_row_idx = cum_parameter_size[i];
- int covariance_col_idx = cum_parameter_size[j];
- int size_i = parameter_sizes[i];
- int size_j = parameter_sizes[j];
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- const ScopedThreadToken scoped_thread_token(&thread_token_provider);
- const int thread_id = scoped_thread_token.token();
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- double* covariance_block =
- workspace.get() +
- thread_id * max_covariance_block_size * max_covariance_block_size;
- if (!GetCovarianceBlockInTangentOrAmbientSpace(
- parameters[i], parameters[j], lift_covariance_to_ambient_space,
- covariance_block)) {
- success = false;
- }
+ int covariance_row_idx = cum_parameter_size[i];
+ int covariance_col_idx = cum_parameter_size[j];
+ int size_i = parameter_sizes[i];
+ int size_j = parameter_sizes[j];
+ double* covariance_block =
+ workspace.get() + thread_id * max_covariance_block_size *
+ max_covariance_block_size;
+ if (!GetCovarianceBlockInTangentOrAmbientSpace(
+ parameters[i], parameters[j],
+ lift_covariance_to_ambient_space, covariance_block)) {
+ success = false;
+ }
- covariance.block(covariance_row_idx, covariance_col_idx,
- size_i, size_j) =
- MatrixRef(covariance_block, size_i, size_j);
+ covariance.block(covariance_row_idx, covariance_col_idx, size_i,
+ size_j) = MatrixRef(covariance_block, size_i, size_j);
- if (i != j) {
- covariance.block(covariance_col_idx, covariance_row_idx,
- size_j, size_i) =
- MatrixRef(covariance_block, size_i, size_j).transpose();
-
- }
- }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+ if (i != j) {
+ covariance.block(covariance_col_idx, covariance_row_idx,
+ size_j, size_i) =
+ MatrixRef(covariance_block, size_i, size_j).transpose();
+ }
+ });
return success;
}
@@ -709,50 +685,27 @@
const int num_threads = options_.num_threads;
std::unique_ptr<double[]> workspace(new double[num_threads * num_cols]);
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- ThreadTokenProvider thread_token_provider(num_threads);
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-#ifdef CERES_USE_OPENMP
-#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
-#endif // CERES_USE_OPENMP
-
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- for (int r = 0; r < num_cols; ++r) {
-#else
problem_->context()->EnsureMinimumThreads(num_threads);
- ParallelFor(problem_->context(),
- 0,
- num_cols,
- num_threads,
- [&](int thread_id, int r) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
- const int row_begin = rows[r];
- const int row_end = rows[r + 1];
- if (row_end != row_begin) {
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- const ScopedThreadToken scoped_thread_token(&thread_token_provider);
- const int thread_id = scoped_thread_token.token();
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
- double* solution = workspace.get() + thread_id * num_cols;
- SolveRTRWithSparseRHS<SuiteSparse_long>(
- num_cols,
- static_cast<SuiteSparse_long*>(R->i),
- static_cast<SuiteSparse_long*>(R->p),
- static_cast<double*>(R->x),
- inverse_permutation[r],
- solution);
- for (int idx = row_begin; idx < row_end; ++idx) {
- const int c = cols[idx];
- values[idx] = solution[inverse_permutation[c]];
- }
- }
- }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+ ParallelFor(
+ problem_->context(),
+ 0,
+ num_cols,
+ num_threads,
+ [&](int thread_id, int r) {
+ const int row_begin = rows[r];
+ const int row_end = rows[r + 1];
+ if (row_end != row_begin) {
+ double* solution = workspace.get() + thread_id * num_cols;
+ SolveRTRWithSparseRHS<SuiteSparse_long>(
+ num_cols, static_cast<SuiteSparse_long*>(R->i),
+ static_cast<SuiteSparse_long*>(R->p), static_cast<double*>(R->x),
+ inverse_permutation[r], solution);
+ for (int idx = row_begin; idx < row_end; ++idx) {
+ const int c = cols[idx];
+ values[idx] = solution[inverse_permutation[c]];
+ }
+ }
+ });
free(permutation);
cholmod_l_free_sparse(&R, &cc);
@@ -918,54 +871,33 @@
const int num_threads = options_.num_threads;
std::unique_ptr<double[]> workspace(new double[num_threads * num_cols]);
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- ThreadTokenProvider thread_token_provider(num_threads);
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-#ifdef CERES_USE_OPENMP
-#pragma omp parallel for num_threads(num_threads) schedule(dynamic)
-#endif // CERES_USE_OPENMP
-
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- for (int r = 0; r < num_cols; ++r) {
-#else
problem_->context()->EnsureMinimumThreads(num_threads);
- ParallelFor(problem_->context(),
- 0,
+ ParallelFor(
+ problem_->context(),
+ 0,
+ num_cols,
+ num_threads,
+ [&](int thread_id, int r) {
+ const int row_begin = rows[r];
+ const int row_end = rows[r + 1];
+ if (row_end != row_begin) {
+ double* solution = workspace.get() + thread_id * num_cols;
+ SolveRTRWithSparseRHS<int>(
num_cols,
- num_threads,
- [&](int thread_id, int r) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+ qr_solver.matrixR().innerIndexPtr(),
+ qr_solver.matrixR().outerIndexPtr(),
+ &qr_solver.matrixR().data().value(0),
+ inverse_permutation.indices().coeff(r),
+ solution);
- const int row_begin = rows[r];
- const int row_end = rows[r + 1];
- if (row_end != row_begin) {
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- const ScopedThreadToken scoped_thread_token(&thread_token_provider);
- const int thread_id = scoped_thread_token.token();
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
- double* solution = workspace.get() + thread_id * num_cols;
- SolveRTRWithSparseRHS<int>(
- num_cols,
- qr_solver.matrixR().innerIndexPtr(),
- qr_solver.matrixR().outerIndexPtr(),
- &qr_solver.matrixR().data().value(0),
- inverse_permutation.indices().coeff(r),
- solution);
-
- // Assign the values of the computed covariance using the
- // inverse permutation used in the QR factorization.
- for (int idx = row_begin; idx < row_end; ++idx) {
- const int c = cols[idx];
- values[idx] = solution[inverse_permutation.indices().coeff(c)];
- }
- }
- }
-
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+ // Assign the values of the computed covariance using the
+ // inverse permutation used in the QR factorization.
+ for (int idx = row_begin; idx < row_end; ++idx) {
+ const int c = cols[idx];
+ values[idx] = solution[inverse_permutation.indices().coeff(c)];
+ }
+ }
+ });
event_logger.AddEvent("Inverse");
diff --git a/internal/ceres/parallel_for_openmp.cc b/internal/ceres/parallel_for_openmp.cc
new file mode 100644
index 0000000..b64f3d7
--- /dev/null
+++ b/internal/ceres/parallel_for_openmp.cc
@@ -0,0 +1,83 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vitus@google.com (Michael Vitus)
+
+// This include must come before any #ifndef check on Ceres compile options.
+#include "ceres/internal/port.h"
+
+#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+
+#include "ceres/parallel_for.h"
+
+#include "ceres/scoped_thread_token.h"
+#include "ceres/thread_token_provider.h"
+#include "glog/logging.h"
+
+namespace ceres {
+namespace internal {
+
+void ParallelFor(ContextImpl* context,
+ int start,
+ int end,
+ int num_threads,
+ const std::function<void(int)>& function) {
+ CHECK_GT(num_threads, 0);
+ CHECK(context != NULL);
+ if (end <= start) {
+ return;
+ }
+
+#ifdef CERES_USE_OPENMP
+#pragma omp parallel for num_threads(num_threads) \
+ schedule(dynamic) if (num_threads > 1)
+#endif // CERES_USE_OPENMP
+ for (int i = start; i < end; ++i) {
+ function(i);
+ }
+}
+
+void ParallelFor(ContextImpl* context,
+ int start,
+ int end,
+ int num_threads,
+ const std::function<void(int thread_id, int i)>& function) {
+ CHECK(context != NULL);
+
+ ThreadTokenProvider thread_token_provider(num_threads);
+ ParallelFor(context, start, end, num_threads, [&](int i) {
+ const ScopedThreadToken scoped_thread_token(&thread_token_provider);
+ const int thread_id = scoped_thread_token.token();
+ function(thread_id, i);
+ });
+}
+
+} // namespace internal
+} // namespace ceres
+
+#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
diff --git a/internal/ceres/parallel_for_test.cc b/internal/ceres/parallel_for_test.cc
index 2de4865..04e5783 100644
--- a/internal/ceres/parallel_for_test.cc
+++ b/internal/ceres/parallel_for_test.cc
@@ -31,8 +31,6 @@
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/port.h"
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-
#include "ceres/parallel_for.h"
#include <cmath>
@@ -129,6 +127,8 @@
}
}
+// This test is only valid when multithreading support is enabled.
+#ifndef CERES_NO_THREADS
TEST(ParallelForWithThreadId, UniqueThreadIds) {
// Ensure the hardware supports more than 1 thread to ensure the test will
// pass.
@@ -157,8 +157,7 @@
EXPECT_THAT(x, UnorderedElementsAreArray({0,1}));
}
+#endif // CERES_NO_THREADS
} // namespace internal
} // namespace ceres
-
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h
index 17175a6..13f2e83 100644
--- a/internal/ceres/program_evaluator.h
+++ b/internal/ceres/program_evaluator.h
@@ -82,25 +82,20 @@
// This include must come before any #ifndef check on Ceres compile options.
#include "ceres/internal/port.h"
+#include <atomic>
#include <map>
#include <memory>
#include <string>
#include <vector>
+
#include "ceres/evaluation_callback.h"
#include "ceres/execution_summary.h"
#include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
#include "ceres/parameter_block.h"
#include "ceres/program.h"
#include "ceres/residual_block.h"
-#include "ceres/scoped_thread_token.h"
#include "ceres/small_blas.h"
-#include "ceres/thread_token_provider.h"
-
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-#include <atomic>
-
-#include "ceres/parallel_for.h"
-#endif
namespace ceres {
namespace internal {
@@ -183,128 +178,84 @@
}
const int num_residual_blocks = program_->NumResidualBlocks();
-
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- ThreadTokenProvider thread_token_provider(options_.num_threads);
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-#ifdef CERES_USE_OPENMP
- // This bool is used to disable the loop if an error is encountered
- // without breaking out of it. The remaining loop iterations are still run,
- // but with an empty body, and so will finish quickly.
- bool abort = false;
-#pragma omp parallel for num_threads(options_.num_threads)
- for (int i = 0; i < num_residual_blocks; ++i) {
-// Disable the loop instead of breaking, as required by OpenMP.
-#pragma omp flush(abort)
-#endif // CERES_USE_OPENMP
-
-#ifdef CERES_NO_THREADS
- bool abort = false;
- for (int i = 0; i < num_residual_blocks; ++i) {
-#endif // CERES_NO_THREADS
-
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+ // This bool is used to disable the loop if an error is encountered without
+ // breaking out of it. The remaining loop iterations are still run, but with
+ // an empty body, and so will finish quickly.
std::atomic_bool abort(false);
-
- ParallelFor(options_.context,
- 0,
- num_residual_blocks,
- options_.num_threads,
- [&](int thread_id, int i) {
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-
- if (abort) {
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- return;
-#else
- continue;
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- }
-
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- const ScopedThreadToken scoped_thread_token(&thread_token_provider);
- const int thread_id = scoped_thread_token.token();
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
- EvaluatePreparer* preparer = &evaluate_preparers_[thread_id];
- EvaluateScratch* scratch = &evaluate_scratch_[thread_id];
-
- // Prepare block residuals if requested.
- const ResidualBlock* residual_block = program_->residual_blocks()[i];
- double* block_residuals = NULL;
- if (residuals != NULL) {
- block_residuals = residuals + residual_layout_[i];
- } else if (gradient != NULL) {
- block_residuals = scratch->residual_block_residuals.get();
- }
-
- // Prepare block jacobians if requested.
- double** block_jacobians = NULL;
- if (jacobian != NULL || gradient != NULL) {
- preparer->Prepare(residual_block,
- i,
- jacobian,
- scratch->jacobian_block_ptrs.get());
- block_jacobians = scratch->jacobian_block_ptrs.get();
- }
-
- // Evaluate the cost, residuals, and jacobians.
- double block_cost;
- if (!residual_block->Evaluate(
- evaluate_options.apply_loss_function,
- &block_cost,
- block_residuals,
- block_jacobians,
- scratch->residual_block_evaluate_scratch.get())) {
- abort = true;
-#ifdef CERES_USE_OPENMP
-// This ensures that the OpenMP threads have a consistent view of 'abort'. Do
-// the flush inside the failure case so that there is usually only one
-// synchronization point per loop iteration instead of two.
-#pragma omp flush(abort)
-#endif // CERES_USE_OPENMP
-
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- return;
-#else
- continue;
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- }
-
- scratch->cost += block_cost;
-
- // Store the jacobians, if they were requested.
- if (jacobian != NULL) {
- jacobian_writer_.Write(i,
- residual_layout_[i],
- block_jacobians,
- jacobian);
- }
-
- // Compute and store the gradient, if it was requested.
- if (gradient != NULL) {
- int num_residuals = residual_block->NumResiduals();
- int num_parameter_blocks = residual_block->NumParameterBlocks();
- for (int j = 0; j < num_parameter_blocks; ++j) {
- const ParameterBlock* parameter_block =
- residual_block->parameter_blocks()[j];
- if (parameter_block->IsConstant()) {
- continue;
+ ParallelFor(
+ options_.context,
+ 0,
+ num_residual_blocks,
+ options_.num_threads,
+ [&](int thread_id, int i) {
+ if (abort) {
+ return;
}
- MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
- block_jacobians[j],
- num_residuals,
- parameter_block->LocalSize(),
- block_residuals,
- scratch->gradient.get() + parameter_block->delta_offset());
- }
- }
- }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+ EvaluatePreparer* preparer = &evaluate_preparers_[thread_id];
+ EvaluateScratch* scratch = &evaluate_scratch_[thread_id];
+
+ // Prepare block residuals if requested.
+ const ResidualBlock* residual_block = program_->residual_blocks()[i];
+ double* block_residuals = NULL;
+ if (residuals != NULL) {
+ block_residuals = residuals + residual_layout_[i];
+ } else if (gradient != NULL) {
+ block_residuals = scratch->residual_block_residuals.get();
+ }
+
+ // Prepare block jacobians if requested.
+ double** block_jacobians = NULL;
+ if (jacobian != NULL || gradient != NULL) {
+ preparer->Prepare(residual_block,
+ i,
+ jacobian,
+ scratch->jacobian_block_ptrs.get());
+ block_jacobians = scratch->jacobian_block_ptrs.get();
+ }
+
+ // Evaluate the cost, residuals, and jacobians.
+ double block_cost;
+ if (!residual_block->Evaluate(
+ evaluate_options.apply_loss_function,
+ &block_cost,
+ block_residuals,
+ block_jacobians,
+ scratch->residual_block_evaluate_scratch.get())) {
+ abort = true;
+ return;
+ }
+
+ scratch->cost += block_cost;
+
+ // Store the jacobians, if they were requested.
+ if (jacobian != NULL) {
+ jacobian_writer_.Write(i,
+ residual_layout_[i],
+ block_jacobians,
+ jacobian);
+ }
+
+ // Compute and store the gradient, if it was requested.
+ if (gradient != NULL) {
+ int num_residuals = residual_block->NumResiduals();
+ int num_parameter_blocks = residual_block->NumParameterBlocks();
+ for (int j = 0; j < num_parameter_blocks; ++j) {
+ const ParameterBlock* parameter_block =
+ residual_block->parameter_blocks()[j];
+ if (parameter_block->IsConstant()) {
+ continue;
+ }
+
+ MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+ block_jacobians[j],
+ num_residuals,
+ parameter_block->LocalSize(),
+ block_residuals,
+ scratch->gradient.get() + parameter_block->delta_offset());
+ }
+ }
+ });
if (!abort) {
const int num_parameters = program_->NumEffectiveParameters();
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index da5d922..203dcc9 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -51,6 +51,7 @@
#include <algorithm>
#include <map>
+#include "Eigen/Dense"
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
@@ -58,18 +59,14 @@
#include "ceres/internal/fixed_array.h"
#include "ceres/invert_psd_matrix.h"
#include "ceres/map_util.h"
+#include "ceres/parallel_for.h"
#include "ceres/schur_eliminator.h"
#include "ceres/scoped_thread_token.h"
#include "ceres/small_blas.h"
#include "ceres/stl_util.h"
#include "ceres/thread_token_provider.h"
-#include "Eigen/Dense"
#include "glog/logging.h"
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
-#include "ceres/parallel_for.h"
-#endif
-
namespace ceres {
namespace internal {
@@ -190,43 +187,29 @@
// Add the diagonal to the schur complement.
if (D != NULL) {
-#ifdef CERES_USE_OPENMP
-#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
-#endif // CERES_USE_OPENMP
+ ParallelFor(
+ context_,
+ num_eliminate_blocks_,
+ num_col_blocks,
+ num_threads_,
+ [&](int i) {
+ const int block_id = i - num_eliminate_blocks_;
+ int r, c, row_stride, col_stride;
+ CellInfo* cell_info = lhs->GetCell(block_id, block_id, &r, &c,
+ &row_stride, &col_stride);
+ if (cell_info != NULL) {
+ const int block_size = bs->cols[i].size;
+ typename EigenTypes<Eigen::Dynamic>::ConstVectorRef diag(
+ D + bs->cols[i].position, block_size);
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
-#else
- ParallelFor(context_, num_eliminate_blocks_, num_col_blocks, num_threads_,
- [&](int i) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
- const int block_id = i - num_eliminate_blocks_;
- int r, c, row_stride, col_stride;
- CellInfo* cell_info = lhs->GetCell(block_id, block_id,
- &r, &c,
- &row_stride, &col_stride);
- if (cell_info != NULL) {
- const int block_size = bs->cols[i].size;
- typename EigenTypes<Eigen::Dynamic>::ConstVectorRef
- diag(D + bs->cols[i].position, block_size);
-
- std::lock_guard<std::mutex> l(cell_info->m);
- MatrixRef m(cell_info->values, row_stride, col_stride);
- m.block(r, c, block_size, block_size).diagonal()
- += diag.array().square().matrix();
- }
- }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+ std::lock_guard<std::mutex> l(cell_info->m);
+ MatrixRef m(cell_info->values, row_stride, col_stride);
+ m.block(r, c, block_size, block_size).diagonal() +=
+ diag.array().square().matrix();
+ }
+ });
}
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- ThreadTokenProvider thread_token_provider(num_threads_);
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
-#ifdef CERES_USE_OPENMP
// Eliminate y blocks one chunk at a time. For each chunk, compute
// the entries of the normal equations and the gradient vector block
// corresponding to the y block and then apply Gaussian elimination
@@ -240,88 +223,76 @@
// z blocks that share a row block/residual term with the y
// block. EliminateRowOuterProduct does the corresponding operation
// for the lhs of the reduced linear system.
-#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
-#endif // CERES_USE_OPENMP
+ ParallelFor(
+ context_,
+ 0,
+ int(chunks_.size()),
+ num_threads_,
+ [&](int thread_id, int i) {
+ double* buffer = buffer_.get() + thread_id * buffer_size_;
+ const Chunk& chunk = chunks_[i];
+ const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
+ const int e_block_size = bs->cols[e_block_id].size;
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- for (int i = 0; i < chunks_.size(); ++i) {
- const ScopedThreadToken scoped_thread_token(&thread_token_provider);
- const int thread_id = scoped_thread_token.token();
-#else
- ParallelFor(context_,
- 0,
- int(chunks_.size()),
- num_threads_,
- [&](int thread_id, int i) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
+ VectorRef(buffer, buffer_size_).setZero();
- double* buffer = buffer_.get() + thread_id * buffer_size_;
- const Chunk& chunk = chunks_[i];
- const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
- const int e_block_size = bs->cols[e_block_id].size;
+ typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix
+ ete(e_block_size, e_block_size);
- VectorRef(buffer, buffer_size_).setZero();
+ if (D != NULL) {
+ const typename EigenTypes<kEBlockSize>::ConstVectorRef
+ diag(D + bs->cols[e_block_id].position, e_block_size);
+ ete = diag.array().square().matrix().asDiagonal();
+ } else {
+ ete.setZero();
+ }
- typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix
- ete(e_block_size, e_block_size);
+ FixedArray<double, 8> g(e_block_size);
+ typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size);
+ gref.setZero();
- if (D != NULL) {
- const typename EigenTypes<kEBlockSize>::ConstVectorRef
- diag(D + bs->cols[e_block_id].position, e_block_size);
- ete = diag.array().square().matrix().asDiagonal();
- } else {
- ete.setZero();
- }
+ // We are going to be computing
+ //
+ // S += F'F - F'E(E'E)^{-1}E'F
+ //
+ // for each Chunk. The computation is broken down into a number of
+ // function calls as below.
- FixedArray<double, 8> g(e_block_size);
- typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size);
- gref.setZero();
+ // Compute the outer product of the e_blocks with themselves (ete
+ // = E'E). Compute the product of the e_blocks with the
+ // corresponding f_blocks (buffer = E'F), the gradient of the terms
+ // in this chunk (g) and add the outer product of the f_blocks to
+ // Schur complement (S += F'F).
+ ChunkDiagonalBlockAndGradient(
+ chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs);
- // We are going to be computing
- //
- // S += F'F - F'E(E'E)^{-1}E'F
- //
- // for each Chunk. The computation is broken down into a number of
- // function calls as below.
+ // Normally one wouldn't compute the inverse explicitly, but
+ // e_block_size will typically be a small number like 3, in
+ // which case it's much faster to compute the inverse once and
+ // use it to multiply other matrices/vectors instead of doing a
+ // Solve call over and over again.
+ typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix inverse_ete =
+ InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete);
- // Compute the outer product of the e_blocks with themselves (ete
- // = E'E). Compute the product of the e_blocks with the
- // corresonding f_blocks (buffer = E'F), the gradient of the terms
- // in this chunk (g) and add the outer product of the f_blocks to
- // Schur complement (S += F'F).
- ChunkDiagonalBlockAndGradient(
- chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs);
+ // For the current chunk compute and update the rhs of the reduced
+ // linear system.
+ //
+ // rhs = F'b - F'E(E'E)^(-1) E'b
- // Normally one wouldn't compute the inverse explicitly, but
- // e_block_size will typically be a small number like 3, in
- // which case its much faster to compute the inverse once and
- // use it to multiply other matrices/vectors instead of doing a
- // Solve call over and over again.
- typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix inverse_ete =
- InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete);
+ FixedArray<double, 8> inverse_ete_g(e_block_size);
+ MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>(
+ inverse_ete.data(),
+ e_block_size,
+ e_block_size,
+ g.get(),
+ inverse_ete_g.get());
- // For the current chunk compute and update the rhs of the reduced
- // linear system.
- //
- // rhs = F'b - F'E(E'E)^(-1) E'b
+ UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);
- FixedArray<double, 8> inverse_ete_g(e_block_size);
- MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>(
- inverse_ete.data(),
- e_block_size,
- e_block_size,
- g.get(),
- inverse_ete_g.get());
-
- UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);
-
- // S -= F'E(E'E)^{-1}E'F
- ChunkOuterProduct(
+ // S -= F'E(E'E)^{-1}E'F
+ ChunkOuterProduct(
thread_id, bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
- }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+ });
// For rows with no e_blocks, the schur complement update reduces to
// S += F'F.
@@ -338,21 +309,17 @@
double* y) {
const CompressedRowBlockStructure* bs = A->block_structure();
-#ifdef CERES_USE_OPENMP
-#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
-#endif // CERES_USE_OPENMP
-
-#if !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
- for (int i = 0; i < chunks_.size(); ++i) {
-#else
- ParallelFor(context_, 0, int(chunks_.size()), num_threads_, [&](int i) {
-#endif // !(defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS))
-
+ ParallelFor(
+ context_,
+ 0,
+ int(chunks_.size()),
+ num_threads_,
+ [&](int i) {
const Chunk& chunk = chunks_[i];
const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
const int e_block_size = bs->cols[e_block_id].size;
- double* y_ptr = y + bs->cols[e_block_id].position;
+ double* y_ptr = y + bs->cols[e_block_id].position;
typename EigenTypes<kEBlockSize>::VectorRef y_block(y_ptr, e_block_size);
typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix
@@ -395,17 +362,14 @@
MatrixTransposeMatrixMultiply
<kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
- values + e_cell.position, row.block.size, e_block_size,
- values + e_cell.position, row.block.size, e_block_size,
- ete.data(), 0, 0, e_block_size, e_block_size);
+ values + e_cell.position, row.block.size, e_block_size,
+ values + e_cell.position, row.block.size, e_block_size,
+ ete.data(), 0, 0, e_block_size, e_block_size);
}
- y_block = InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete)
- * y_block;
- }
-#if defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
- );
-#endif // defined(CERES_USE_TBB) || defined(CERES_USE_CXX11_THREADS)
+ y_block =
+ InvertPSDMatrix<kEBlockSize>(assume_full_rank_ete_, ete) * y_block;
+ });
}
// Update the rhs of the reduced linear system. Compute
diff --git a/jni/Android.mk b/jni/Android.mk
index 2c58147..80861dd 100644
--- a/jni/Android.mk
+++ b/jni/Android.mk
@@ -178,6 +178,7 @@
$(CERES_SRC_PATH)/low_rank_inverse_hessian.cc \
$(CERES_SRC_PATH)/minimizer.cc \
$(CERES_SRC_PATH)/normal_prior.cc \
+ $(CERES_SRC_PATH)/parallel_for_openmp.cc \
$(CERES_SRC_PATH)/parameter_block_ordering.cc \
$(CERES_SRC_PATH)/partitioned_matrix_view.cc \
$(CERES_SRC_PATH)/polynomial.cc \