Parallel operations on vectors

The main focus of this change is to parallelize the remaining operations (most
of them operations on vectors) in the code path used by the iterative Schur
complement solver.

Parallelization is handled using lazy evaluation of Eigen expressions.
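
For illustration, a minimal sketch of how the new ParallelAssign helper (added
to parallel_for.h below) is meant to be used; the function, vector names and
sizes are made up. The right-hand side stays a lazy Eigen expression and each
thread evaluates only its own contiguous segment of it, instead of
materializing the whole result into a temporary first:

  #include "ceres/context_impl.h"
  #include "ceres/internal/eigen.h"
  #include "ceres/parallel_for.h"

  // Illustrative only: names and sizes are made up.
  void ScaleAndAdd(ceres::internal::ContextImpl* context, int num_threads) {
    ceres::internal::Vector x = ceres::internal::Vector::Random(1 << 20);
    ceres::internal::Vector y = ceres::internal::Vector::Random(1 << 20);
    ceres::internal::Vector z(1 << 20);
    // The rhs is a lazy Eigen expression; ParallelAssign splits [0, rows)
    // into contiguous blocks and each thread evaluates
    // lhs.segment(...) = rhs.segment(...) on its own block, so no
    // full-size temporary is created.
    ceres::internal::ParallelAssign(context, num_threads, z, x + 2.0 * y);
  }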

On a Linux PC with an Intel Xeon Platinum 8176 processor, parallelization of
vector operations has the following effect:

Running ./bin/parallel_vector_operations_benchmark
Run on (112 X 3200.32 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x56)
  L1 Instruction 32 KiB (x56)
  L2 Unified 1024 KiB (x56)
  L3 Unified 39424 KiB (x2)
Load Average: 3.30, 8.41, 11.82
-----------------------------------
Benchmark                      Time
-----------------------------------
SetZero                 10009532 ns
SetZeroParallel/1       10024139 ns
...
SetZeroParallel/16        877606 ns

Negate                   4978856 ns
NegateParallel/1         5145413 ns
...
NegateParallel/16         721823 ns

Assign                  10731408 ns
AssignParallel/1        10749944 ns
...
AssignParallel/16        1829381 ns

D2X                     15214399 ns
D2XParallel/1           15623245 ns
...
D2XParallel/16           2687060 ns

DivideSqrt               8220050 ns
DivideSqrtParallel/1     9088467 ns
...
DivideSqrtParallel/16     905569 ns

Clamp                    3502010 ns
ClampParallel/1          4507897 ns
...
ClampParallel/16          759576 ns

Norm                     4426782 ns
NormParallel/1           4442805 ns
...
NormParallel/16           430290 ns

Dot                      9023276 ns
DotParallel/1            9031304 ns
...
DotParallel/16           1157267 ns

Axpby                   14608289 ns
AxpbyParallel/1         14570825 ns
...
AxpbyParallel/16         2672220 ns
-----------------------------------

Multi-threading of vector operations in the iterative Schur complement (ISC)
solver and in program evaluation results in the following improvements:

Running ./bin/evaluation_benchmark
--------------------------------------------------------------------------------------
Benchmark                                                               this   2fd81de
--------------------------------------------------------------------------------------
Residuals<problem-13682-4456117-pre.txt>/1                           4136 ms   4292 ms
Residuals<problem-13682-4456117-pre.txt>/2                           2919 ms   2670 ms
Residuals<problem-13682-4456117-pre.txt>/4                           2065 ms   2198 ms
Residuals<problem-13682-4456117-pre.txt>/8                           1458 ms   1609 ms
Residuals<problem-13682-4456117-pre.txt>/16                          1152 ms   1227 ms

ResidualsAndJacobian<problem-13682-4456117-pre.txt>/1               19759 ms  20084 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/2               10921 ms  10977 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/4                6220 ms   6941 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/8                3490 ms   4398 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/16               2277 ms   3172 ms

Plus<problem-13682-4456117-pre.txt>/1                                 339 ms    322 ms
Plus<problem-13682-4456117-pre.txt>/2                                 220 ms
Plus<problem-13682-4456117-pre.txt>/4                                 128 ms
Plus<problem-13682-4456117-pre.txt>/8                                78.0 ms
Plus<problem-13682-4456117-pre.txt>/16                               49.8 ms

ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/1       2434 ms   2478 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/2       2706 ms   2688 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/4       1430 ms   1548 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/8        742 ms    883 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/16       438 ms    555 ms

ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/1   2438 ms   2481 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/2   2565 ms   2790 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/4   1434 ms   1551 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/8    765 ms    892 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/16   435 ms    559 ms

JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/1           1278 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/2           1555 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/4            833 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/8            459 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/16           250 ms

JacobianScaleColumns<problem-13682-4456117-pre.txt>/1                1468 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/2                1871 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/4                 957 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/8                 528 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/16                294 ms

End-to-end improvements with bundle_adjuster invoked with
./bin/bundle_adjuster --num_threads 28 --num_iterations 40 \
                      --linear_solver iterative_schur \
                      --preconditioner jacobi --input
---------------------------------------------
Problem                         this  2fd81de
---------------------------------------------
problem-13682-4456117-pre.txt  508.6    892.7
problem-1778-993923-pre.txt    763.8   1129.9
problem-1723-156502-pre.txt      6.3     14.4
problem-356-226730-pre.txt      76.3    116.2
problem-257-65132-pre.txt       38.6     52.0

Change-Id: Ie31cc5015f13fa479c16ffb5ce48c9b880990d49
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index d91a195..26a4a79 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -38,6 +38,7 @@
 
 #include "ceres/context_impl.h"
 #include "ceres/internal/disable_warnings.h"
+#include "ceres/internal/eigen.h"
 #include "ceres/internal/export.h"
 #include "glog/logging.h"
 
@@ -61,7 +62,7 @@
 namespace parallel_for_details {
 // Get arguments of callable as a tuple
 template <typename F, typename... Args>
-std::tuple<Args...> args_of(void (F::*)(Args...) const);
+std::tuple<std::decay_t<Args>...> args_of(void (F::*)(Args...) const);
 
 template <typename F>
 using args_of_t = decltype(args_of(&F::operator()));
@@ -75,24 +76,58 @@
 // For parallel for iterations of type [](int i) -> void
 template <typename F>
 struct InvokeImpl<F, std::tuple<int>> {
-  static void Invoke(int thread_id, int i, const F& function) {
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
     (void)thread_id;
-    function(i);
+    for (int i = start; i < end; ++i) {
+      function(i);
+    }
   }
 };
 
 // For parallel for iterations of type [](int thread_id, int i) -> void
 template <typename F>
 struct InvokeImpl<F, std::tuple<int, int>> {
-  static void Invoke(int thread_id, int i, const F& function) {
-    function(thread_id, i);
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
+    for (int i = start; i < end; ++i) {
+      function(thread_id, i);
+    }
+  }
+};
+
+// For parallel for iterations of type [](tuple<int, int> range) -> void
+template <typename F>
+struct InvokeImpl<F, std::tuple<std::tuple<int, int>>> {
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
+    (void)thread_id;
+    function(std::make_tuple(start, end));
+  }
+};
+
+// For parallel for iterations of type [](int thread_id, tuple<int, int> range)
+// -> void
+template <typename F>
+struct InvokeImpl<F, std::tuple<int, std::tuple<int, int>>> {
+  static void InvokeOnSegment(int thread_id,
+                              int start,
+                              int end,
+                              const F& function) {
+    function(thread_id, std::make_tuple(start, end));
   }
 };
 
 // Invoke function passing thread_id only if required
 template <typename F>
-void Invoke(int thread_id, int i, const F& function) {
-  InvokeImpl<F, args_of_t<F>>::Invoke(thread_id, i, function);
+void InvokeOnSegment(int thread_id, int start, int end, const F& function) {
+  InvokeImpl<F, args_of_t<F>>::InvokeOnSegment(thread_id, start, end, function);
 }
 
 // Check if it is possible to split range [start; end) into at most
@@ -218,7 +253,8 @@
 // implemented by each threading backend
 template <typename F>
 void ParallelInvoke(ContextImpl* context,
-                    int i,
+                    int start,
+                    int end,
                     int num_threads,
                     const F& function);
 
@@ -241,9 +277,7 @@
   }
 
   if (num_threads == 1 || end - start == 1) {
-    for (int i = start; i < end; ++i) {
-      Invoke<F>(0, i, function);
-    }
+    InvokeOnSegment<F>(0, start, end, function);
     return;
   }
 
@@ -293,9 +327,8 @@
                 const int partition_start = partitions[partition_id];
                 const int partition_end = partitions[partition_id + 1];
 
-                for (int i = partition_start; i < partition_end; ++i) {
-                  Invoke<F>(thread_id, i, function);
-                }
+                InvokeOnSegment<F>(
+                    thread_id, partition_start, partition_end, function);
               });
 }
 
@@ -330,11 +363,50 @@
                 const int partition_start = partitions[partition_id];
                 const int partition_end = partitions[partition_id + 1];
 
-                for (int i = partition_start; i < partition_end; ++i) {
-                  Invoke<F>(thread_id, i, function);
-                }
+                InvokeOnSegment<F>(
+                    thread_id, partition_start, partition_end, function);
               });
 }
+
+// Evaluate a vector expression in parallel.
+// Assuming LhsExpression and RhsExpression are some sort of
+// column-vector expression, the assignment lhs = rhs
+// is evaluated over a set of contiguous blocks in parallel.
+// This is expected to work well in the case of vector-based
+// expressions (since they typically do not result in
+// temporaries).
+// This method expects lhs to be size-compatible with rhs.
+template <typename LhsExpression, typename RhsExpression>
+void ParallelAssign(ContextImpl* context,
+                    int num_threads,
+                    LhsExpression& lhs,
+                    const RhsExpression& rhs) {
+  static_assert(LhsExpression::ColsAtCompileTime == 1);
+  static_assert(RhsExpression::ColsAtCompileTime == 1);
+  CHECK_EQ(lhs.rows(), rhs.rows());
+  const int num_rows = lhs.rows();
+  ParallelFor(context,
+              0,
+              num_rows,
+              num_threads,
+              [&lhs, &rhs](const std::tuple<int, int>& range) {
+                auto [start, end] = range;
+                lhs.segment(start, end - start) =
+                    rhs.segment(start, end - start);
+              });
+}
+
+// Set vector to zero using num_threads threads.
+template <typename VectorType>
+void ParallelSetZero(ContextImpl* context,
+                     int num_threads,
+                     VectorType& vector) {
+  ParallelSetZero(context, num_threads, vector.data(), vector.rows());
+}
+void ParallelSetZero(ContextImpl* context,
+                     int num_threads,
+                     double* values,
+                     int num_values);
 }  // namespace ceres::internal
 
 // Backend-specific implementations of ParallelInvoke