Parallel operations on vectors

The main focus of this change is to parallelize the remaining operations (most
of them operations on vectors) in the code path used by the iterative Schur
complement solver.

Parallelization is handled using lazy evaluation of Eigen expressions.
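
For illustration, a minimal sketch of how the new ParallelAssign helper (added
to parallel_for.h below) is meant to be used; the function, vector names and
sizes are made up. The right-hand side stays a lazy Eigen expression and each
thread evaluates only its own contiguous segment of it, instead of
materializing the whole result into a temporary first:

  #include "ceres/context_impl.h"
  #include "ceres/internal/eigen.h"
  #include "ceres/parallel_for.h"

  // Illustrative only: names and sizes are made up.
  void ScaleAndAdd(ceres::internal::ContextImpl* context, int num_threads) {
    ceres::internal::Vector x = ceres::internal::Vector::Random(1 << 20);
    ceres::internal::Vector y = ceres::internal::Vector::Random(1 << 20);
    ceres::internal::Vector z(1 << 20);
    // The rhs is a lazy Eigen expression; ParallelAssign splits [0, rows)
    // into contiguous blocks and each thread evaluates
    // lhs.segment(...) = rhs.segment(...) on its own block, so no
    // full-size temporary is created.
    ceres::internal::ParallelAssign(context, num_threads, z, x + 2.0 * y);
  }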

On a Linux PC with an Intel Xeon Platinum 8176 processor, parallelization of
vector operations has the following effect:

Running ./bin/parallel_vector_operations_benchmark
Run on (112 X 3200.32 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x56)
  L1 Instruction 32 KiB (x56)
  L2 Unified 1024 KiB (x56)
  L3 Unified 39424 KiB (x2)
Load Average: 3.30, 8.41, 11.82
-----------------------------------
Benchmark                      Time
-----------------------------------
SetZero                 10009532 ns
SetZeroParallel/1       10024139 ns
...
SetZeroParallel/16        877606 ns

Negate                   4978856 ns
NegateParallel/1         5145413 ns
...
NegateParallel/16         721823 ns

Assign                  10731408 ns
AssignParallel/1        10749944 ns
...
AssignParallel/16        1829381 ns

D2X                     15214399 ns
D2XParallel/1           15623245 ns
...
D2XParallel/16           2687060 ns

DivideSqrt               8220050 ns
DivideSqrtParallel/1     9088467 ns
...
DivideSqrtParallel/16     905569 ns

Clamp                    3502010 ns
ClampParallel/1          4507897 ns
...
ClampParallel/16          759576 ns

Norm                     4426782 ns
NormParallel/1           4442805 ns
...
NormParallel/16           430290 ns

Dot                      9023276 ns
DotParallel/1            9031304 ns
...
DotParallel/16           1157267 ns

Axpby                   14608289 ns
AxpbyParallel/1         14570825 ns
...
AxpbyParallel/16         2672220 ns
-----------------------------------

Multi-threading of vector operations in the iterative Schur complement (ISC)
solver and in program evaluation results in the following improvements:

Running ./bin/evaluation_benchmark
--------------------------------------------------------------------------------------
Benchmark                                                               this   2fd81de
--------------------------------------------------------------------------------------
Residuals<problem-13682-4456117-pre.txt>/1                           4136 ms   4292 ms
Residuals<problem-13682-4456117-pre.txt>/2                           2919 ms   2670 ms
Residuals<problem-13682-4456117-pre.txt>/4                           2065 ms   2198 ms
Residuals<problem-13682-4456117-pre.txt>/8                           1458 ms   1609 ms
Residuals<problem-13682-4456117-pre.txt>/16                          1152 ms   1227 ms

ResidualsAndJacobian<problem-13682-4456117-pre.txt>/1               19759 ms  20084 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/2               10921 ms  10977 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/4                6220 ms   6941 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/8                3490 ms   4398 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/16               2277 ms   3172 ms

Plus<problem-13682-4456117-pre.txt>/1                                 339 ms    322 ms
Plus<problem-13682-4456117-pre.txt>/2                                 220 ms
Plus<problem-13682-4456117-pre.txt>/4                                 128 ms
Plus<problem-13682-4456117-pre.txt>/8                                78.0 ms
Plus<problem-13682-4456117-pre.txt>/16                               49.8 ms

ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/1       2434 ms   2478 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/2       2706 ms   2688 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/4       1430 ms   1548 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/8        742 ms    883 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/16       438 ms    555 ms

ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/1   2438 ms   2481 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/2   2565 ms   2790 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/4   1434 ms   1551 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/8    765 ms    892 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/16   435 ms    559 ms

JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/1           1278 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/2           1555 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/4            833 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/8            459 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/16           250 ms

JacobianScaleColumns<problem-13682-4456117-pre.txt>/1                1468 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/2                1871 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/4                 957 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/8                 528 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/16                294 ms

End-to-end improvements with bundle_adjuster invoked with
./bin/bundle_adjuster --num_threads 28 --num_iterations 40 \
                      --linear_solver iterative_schur \
                      --preconditioner jacobi --input
---------------------------------------------
Problem                         this  2fd81de
---------------------------------------------
problem-13682-4456117-pre.txt  508.6    892.7
problem-1778-993923-pre.txt    763.8   1129.9
problem-1723-156502-pre.txt      6.3     14.4
problem-356-226730-pre.txt      76.3    116.2
problem-257-65132-pre.txt       38.6     52.0

Change-Id: Ie31cc5015f13fa479c16ffb5ce48c9b880990d49
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index d91a195..26a4a79 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -38,6 +38,7 @@
 
 #include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
+#include "ceres/internal/eigen.h"
 #include "ceres/internal/export.h"
 #include "glog/logging.h"
 
@@ -61,7 +62,7 @@
 namespace parallel_for_details {
 // Get arguments of callable as a tuple
 template <typename F, typename... Args>
-std::tuple<Args...> args_of(void (F::*)(Args...) const);
+std::tuple<std::decay_t<Args>...> args_of(void (F::*)(Args...) const);
 
 template <typename F>
 using args_of_t = decltype(args_of(&F::operator()));
@@ -75,24 +76,58 @@
 // For parallel for iterations of type [](int i) -> void
 template <typename F>
 struct InvokeImpl<F, std::tuple<int>> {
-  static void Invoke(int thread_id, int i, const F& function) {
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
     (void)thread_id;
-    function(i);
+    for (int i = start; i < end; ++i) {
+      function(i);
+    }
   }
 };
 
 // For parallel for iterations of type [](int thread_id, int i) -> void
 template <typename F>
 struct InvokeImpl<F, std::tuple<int, int>> {
-  static void Invoke(int thread_id, int i, const F& function) {
-    function(thread_id, i);
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
+    for (int i = start; i < end; ++i) {
+      function(thread_id, i);
+    }
+  }
+};
+
+// For parallel for iterations of type [](tuple<int, int> range) -> void
+template <typename F>
+struct InvokeImpl<F, std::tuple<std::tuple<int, int>>> {
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
+    (void)thread_id;
+    function(std::make_tuple(start, end));
+  }
+};
+
+// For parallel for iterations of type [](int thread_id, tuple<int, int> range)
+// -> void
+template <typename F>
+struct InvokeImpl<F, std::tuple<int, std::tuple<int, int>>> {
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
+    function(thread_id, std::make_tuple(start, end));
   }
 };
 
 // Invoke function passing thread_id only if required
 template <typename F>
-void Invoke(int thread_id, int i, const F& function) {
-  InvokeImpl<F, args_of_t<F>>::Invoke(thread_id, i, function);
+void InvokeOnSegment(int thread_id, int start, int end, const F& function) {
+  InvokeImpl<F, args_of_t<F>>::InvokeOnSegment(thread_id, start, end, function);
 }
 
 // Check if it is possible to split range [start; end) into at most
@@ -218,7 +253,8 @@
 // implemented by each threading backend
 template <typename F>
 void ParallelInvoke(ContextImpl* context,
-                    int i,
+                    int start,
+                    int end,
                     int num_threads,
                     const F& function);
 
@@ -241,9 +277,7 @@
   }
 
   if (num_threads == 1 || end - start == 1) {
-    for (int i = start; i < end; ++i) {
-      Invoke<F>(0, i, function);
-    }
+    InvokeOnSegment<F>(0, start, end, function);
     return;
   }
 
@@ -293,9 +327,8 @@
                 const int partition_start = partitions[partition_id];
                 const int partition_end = partitions[partition_id + 1];
 
-                for (int i = partition_start; i < partition_end; ++i) {
-                  Invoke<F>(thread_id, i, function);
-                }
+                InvokeOnSegment<F>(
+                    thread_id, partition_start, partition_end, function);
               });
 }
 
@@ -330,11 +363,50 @@
                 const int partition_start = partitions[partition_id];
                 const int partition_end = partitions[partition_id + 1];
 
-                for (int i = partition_start; i < partition_end; ++i) {
-                  Invoke<F>(thread_id, i, function);
-                }
+                InvokeOnSegment<F>(
+                    thread_id, partition_start, partition_end, function);
               });
 }
+
+// Evaluate a vector expression in parallel.
+// Assuming LhsExpression and RhsExpression are some sort of
+// column-vector expression, the assignment lhs = rhs
+// is evaluated over a set of contiguous blocks in parallel.
+// This is expected to work well in the case of vector-based
+// expressions (since they typically do not result in
+// temporaries).
+// This method expects lhs to be size-compatible with rhs.
+template <typename LhsExpression, typename RhsExpression>
+void ParallelAssign(ContextImpl* context,
+                    int num_threads,
+                    LhsExpression& lhs,
+                    const RhsExpression& rhs) {
+  static_assert(LhsExpression::ColsAtCompileTime == 1);
+  static_assert(RhsExpression::ColsAtCompileTime == 1);
+  CHECK_EQ(lhs.rows(), rhs.rows());
+  const int num_rows = lhs.rows();
+  ParallelFor(context,
+              0,
+              num_rows,
+              num_threads,
+              [&lhs, &rhs](const std::tuple<int, int>& range) {
+                auto [start, end] = range;
+                lhs.segment(start, end - start) =
+                    rhs.segment(start, end - start);
+              });
+}
+
+// Set vector to zero using num_threads threads.
+template <typename VectorType>
+void ParallelSetZero(ContextImpl* context,
+                     int num_threads,
+                     VectorType& vector) {
+  ParallelSetZero(context, num_threads, vector.data(), vector.rows());
+}
+void ParallelSetZero(ContextImpl* context,
+                     int num_threads,
+                     double* values,
+                     int num_values);
 }  // namespace ceres::internal
 
 // Backend-specific implementations of ParallelInvoke