Parallel operations on vectors
The main focus of this change is to parallelize the remaining operations (most
of them operations on vectors) in the code path used by the iterative Schur
complement.
Parallelization is handled using lazy evaluation of Eigen expressions.
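As a rough illustration of the approach (hypothetical helper, not part of this
change): because Eigen evaluates expressions lazily, an assignment such as
x = a * p + b * q can be split into contiguous segments that are evaluated
independently on different threads, without materializing temporaries:

  // Sketch only: evaluate one contiguous [start; end) slice of lhs = rhs.
  // Each thread gets its own slice, so the full expression is evaluated
  // exactly once per element and no intermediate vector is allocated.
  template <typename LhsExpression, typename RhsExpression>
  void AssignSegment(LhsExpression& lhs,
                     const RhsExpression& rhs,
                     int start,
                     int end) {
    lhs.segment(start, end - start) = rhs.segment(start, end - start);
  }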
On a Linux PC with an Intel 8176 processor, parallelization of vector
operations has the following effect:
Running ./bin/parallel_vector_operations_benchmark
Run on (112 X 3200.32 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x56)
L1 Instruction 32 KiB (x56)
L2 Unified 1024 KiB (x56)
L3 Unified 39424 KiB (x2)
Load Average: 3.30, 8.41, 11.82
-----------------------------------
Benchmark Time
-----------------------------------
SetZero 10009532 ns
SetZeroParallel/1 10024139 ns
...
SetZeroParallel/16 877606 ns
Negate 4978856 ns
NegateParallel/1 5145413 ns
...
NegateParallel/16 721823 ns
Assign 10731408 ns
AssignParallel/1 10749944 ns
...
AssignParallel/16 1829381 ns
D2X 15214399 ns
D2XParallel/1 15623245 ns
...
D2XParallel/16 2687060 ns
DivideSqrt 8220050 ns
DivideSqrtParallel/1 9088467 ns
...
DivideSqrtParallel/16 905569 ns
Clamp 3502010 ns
ClampParallel/1 4507897 ns
...
ClampParallel/16 759576 ns
Norm 4426782 ns
NormParallel/1 4442805 ns
...
NormParallel/16 430290 ns
Dot 9023276 ns
DotParallel/1 9031304 ns
...
DotParallel/16 1157267 ns
Axpby 14608289 ns
AxpbyParallel/1 14570825 ns
...
AxpbyParallel/16 2672220 ns
-----------------------------------
Multi-threading of vector operations in the iterative Schur complement and in
program evaluation results in the following improvements:
Running ./bin/evaluation_benchmark
--------------------------------------------------------------------------------------
Benchmark this 2fd81de
--------------------------------------------------------------------------------------
Residuals<problem-13682-4456117-pre.txt>/1 4136 ms 4292 ms
Residuals<problem-13682-4456117-pre.txt>/2 2919 ms 2670 ms
Residuals<problem-13682-4456117-pre.txt>/4 2065 ms 2198 ms
Residuals<problem-13682-4456117-pre.txt>/8 1458 ms 1609 ms
Residuals<problem-13682-4456117-pre.txt>/16 1152 ms 1227 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/1 19759 ms 20084 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/2 10921 ms 10977 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/4 6220 ms 6941 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/8 3490 ms 4398 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/16 2277 ms 3172 ms
Plus<problem-13682-4456117-pre.txt>/1 339 ms 322 ms
Plus<problem-13682-4456117-pre.txt>/2 220 ms
Plus<problem-13682-4456117-pre.txt>/4 128 ms
Plus<problem-13682-4456117-pre.txt>/8 78.0 ms
Plus<problem-13682-4456117-pre.txt>/16 49.8 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/1 2434 ms 2478 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/2 2706 ms 2688 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/4 1430 ms 1548 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/8 742 ms 883 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/16 438 ms 555 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/1 2438 ms 2481 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/2 2565 ms 2790 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/4 1434 ms 1551 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/8 765 ms 892 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/16 435 ms 559 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/1 1278 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/2 1555 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/4 833 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/8 459 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/16 250 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/1 1468 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/2 1871 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/4 957 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/8 528 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/16 294 ms
End-to-end improvements with bundle_adjuster invoked with
./bin/bundle_adjuster --num_threads 28 --num_iterations 40 \
--linear_solver iterative_schur \
--preconditioner jacobi --input
---------------------------------------------
Problem this 2fd81de
---------------------------------------------
problem-13682-4456117-pre.txt 508.6 892.7
problem-1778-993923-pre.txt 763.8 1129.9
problem-1723-156502-pre.txt 6.3 14.4
problem-356-226730-pre.txt 76.3 116.2
problem-257-65132-pre.txt 38.6 52.0
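For reference, a hedged usage sketch of the new interfaces added in the diff
below (vector names, sizes, and num_threads are made up for illustration;
x, y, z are assumed to be ceres::internal::Vector objects of equal size and
context a ContextImpl* with an attached thread pool):

  // Zero z, then evaluate z = x + y segment-wise across threads.
  ParallelSetZero(context, num_threads, z);
  ParallelAssign(context, num_threads, z, x + y);
  // ParallelFor now also accepts lambdas that operate on a whole
  // [start; end) range instead of a single index.
  ParallelFor(context, 0, z.rows(), num_threads,
              [&z](const std::tuple<int, int>& range) {
                auto [start, end] = range;
                z.segment(start, end - start) *= 2.0;
              });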
Change-Id: Ie31cc5015f13fa479c16ffb5ce48c9b880990d49
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index d91a195..26a4a79 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -38,6 +38,7 @@
#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
+#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "glog/logging.h"
@@ -61,7 +62,7 @@
namespace parallel_for_details {
// Get arguments of callable as a tuple
template <typename F, typename... Args>
-std::tuple<Args...> args_of(void (F::*)(Args...) const);
+std::tuple<std::decay_t<Args>...> args_of(void (F::*)(Args...) const);
template <typename F>
using args_of_t = decltype(args_of(&F::operator()));
@@ -75,24 +76,58 @@
// For parallel for iterations of type [](int i) -> void
template <typename F>
struct InvokeImpl<F, std::tuple<int>> {
- static void Invoke(int thread_id, int i, const F& function) {
+ static void InvokeOnSegment(int thread_id,
+ int start,
+ int end,
+ const F& function) {
(void)thread_id;
- function(i);
+ for (int i = start; i < end; ++i) {
+ function(i);
+ }
}
};
// For parallel for iterations of type [](int thread_id, int i) -> void
template <typename F>
struct InvokeImpl<F, std::tuple<int, int>> {
- static void Invoke(int thread_id, int i, const F& function) {
- function(thread_id, i);
+ static void InvokeOnSegment(int thread_id,
+ int start,
+ int end,
+ const F& function) {
+ for (int i = start; i < end; ++i) {
+ function(thread_id, i);
+ }
+ }
+};
+
+// For parallel for iterations of type [](tuple<int, int> range) -> void
+template <typename F>
+struct InvokeImpl<F, std::tuple<std::tuple<int, int>>> {
+ static void InvokeOnSegment(int thread_id,
+ int start,
+ int end,
+ const F& function) {
+ (void)thread_id;
+ function(std::make_tuple(start, end));
+ }
+};
+
+// For parallel for iterations of type [](int thread_id, tuple<int, int> range)
+// -> void
+template <typename F>
+struct InvokeImpl<F, std::tuple<int, std::tuple<int, int>>> {
+ static void InvokeOnSegment(int thread_id,
+ int start,
+ int end,
+ const F& function) {
+ function(thread_id, std::make_tuple(start, end));
}
};
// Invoke function passing thread_id only if required
template <typename F>
-void Invoke(int thread_id, int i, const F& function) {
- InvokeImpl<F, args_of_t<F>>::Invoke(thread_id, i, function);
+void InvokeOnSegment(int thread_id, int start, int end, const F& function) {
+ InvokeImpl<F, args_of_t<F>>::InvokeOnSegment(thread_id, start, end, function);
}
// Check if it is possible to split range [start; end) into at most
@@ -218,7 +253,8 @@
// implemented by each threading backend
template <typename F>
void ParallelInvoke(ContextImpl* context,
- int i,
+ int start,
+ int end,
int num_threads,
const F& function);
@@ -241,9 +277,7 @@
}
if (num_threads == 1 || end - start == 1) {
- for (int i = start; i < end; ++i) {
- Invoke<F>(0, i, function);
- }
+ InvokeOnSegment<F>(0, start, end, function);
return;
}
@@ -293,9 +327,8 @@
const int partition_start = partitions[partition_id];
const int partition_end = partitions[partition_id + 1];
- for (int i = partition_start; i < partition_end; ++i) {
- Invoke<F>(thread_id, i, function);
- }
+ InvokeOnSegment<F>(
+ thread_id, partition_start, partition_end, function);
});
}
@@ -330,11 +363,50 @@
const int partition_start = partitions[partition_id];
const int partition_end = partitions[partition_id + 1];
- for (int i = partition_start; i < partition_end; ++i) {
- Invoke<F>(thread_id, i, function);
- }
+ InvokeOnSegment<F>(
+ thread_id, partition_start, partition_end, function);
});
}
+
+// Evaluate vector expression in parallel
+// Assuming LhsExpression and RhsExpression are some sort of
+// column-vector expression, assignment lhs = rhs
+// is evaluated over a set of contiguous blocks in parallel.
+// This is expected to work well in the case of vector-based
+// expressions (since they typically do not result in
+// temporaries).
+// This method expects lhs to be size-compatible with rhs
+template <typename LhsExpression, typename RhsExpression>
+void ParallelAssign(ContextImpl* context,
+ int num_threads,
+ LhsExpression& lhs,
+ const RhsExpression& rhs) {
+ static_assert(LhsExpression::ColsAtCompileTime == 1);
+ static_assert(RhsExpression::ColsAtCompileTime == 1);
+ CHECK_EQ(lhs.rows(), rhs.rows());
+ const int num_rows = lhs.rows();
+ ParallelFor(context,
+ 0,
+ num_rows,
+ num_threads,
+ [&lhs, &rhs](const std::tuple<int, int>& range) {
+ auto [start, end] = range;
+ lhs.segment(start, end - start) =
+ rhs.segment(start, end - start);
+ });
+}
+
+// Set vector to zero using num_threads
+template <typename VectorType>
+void ParallelSetZero(ContextImpl* context,
+ int num_threads,
+ VectorType& vector) {
+ ParallelSetZero(context, num_threads, vector.data(), vector.rows());
+}
+void ParallelSetZero(ContextImpl* context,
+ int num_threads,
+ double* values,
+ int num_values);
} // namespace ceres::internal
// Backend-specific implementations of ParallelInvoke