Single-threaded operations on small vectors

As pointed out by several users, introduction of parallel operations on
vectors severely impacts solver performance on small problems, with time
consumption increasing with the number of threads.

In order to minimize overhead of trying to execute small tasks using a
large number of threads, task scheduling mechanism was changed to avoid
scheduling all tasks at once.

However, there is still a large difference in exectuion time because the
main thread always launches the next thread before starting doing the
work. This leads to several orders of magnitude slowdown when going from
a single-threaded execution (which follows a fast-forward path to a
single loop over all indices, without any synchronization involved)
to a two-thread execution:

/bin/parallel_vector_operations_benchmark
-------------------------------------------
Benchmark                              Time
-------------------------------------------
SetZero/128                         12.8 ns
SetZeroParallel/128/1               16.6 ns
SetZeroParallel/128/2               2211 ns

In order to eliminate this effect, we limit the block-size of parallel
execution of vector operations to 2^16 elements (thus, starting parallel
execution only for vectors of at least 2^17 elements).

Threshold of 2^16 elements was choosen by evaluating thresholds from
2^10 to 2^20 (only powers of 2), with 2^14..2^20 significantly reducing
worst-case runtime degradation.

Details can be found in discussion of the issue at
https://github.com/ceres-solver/ceres-solver/issues/1016

Change-Id: I555c882d63ee53323ceb426743b970f989b65503
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 5bd2792..f9fc241 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -211,9 +211,9 @@
     linear_solver.cc
     low_rank_inverse_hessian.cc
     minimizer.cc
-    parallel_for.cc
     parallel_invoke.cc
     parallel_utils.cc
+    parallel_vector_ops.cc
     parameter_block_ordering.cc
     partitioned_matrix_view.cc
     polynomial.cc
diff --git a/internal/ceres/eigen_vector_ops.h b/internal/ceres/eigen_vector_ops.h
index b7f57f0..6ebff88 100644
--- a/internal/ceres/eigen_vector_ops.h
+++ b/internal/ceres/eigen_vector_ops.h
@@ -34,6 +34,7 @@
 #include <numeric>
 
 #include "ceres/internal/eigen.h"
+#include "ceres/internal/fixed_array.h"
 #include "ceres/parallel_for.h"
 #include "ceres/parallel_vector_ops.h"
 
@@ -46,15 +47,17 @@
 inline double Norm(const Eigen::DenseBase<Derived>& x,
                    ContextImpl* context,
                    int num_threads) {
-  std::vector<double> norms(num_threads);
-  ParallelFor(context,
-              0,
-              x.rows(),
-              num_threads,
-              [&x, &norms](int thread_id, std::tuple<int, int> range) {
-                auto [start, end] = range;
-                norms[thread_id] += x.segment(start, end - start).squaredNorm();
-              });
+  FixedArray<double> norms(num_threads, 0.);
+  ParallelFor(
+      context,
+      0,
+      x.rows(),
+      num_threads,
+      [&x, &norms](int thread_id, std::tuple<int, int> range) {
+        auto [start, end] = range;
+        norms[thread_id] += x.segment(start, end - start).squaredNorm();
+      },
+      kMinBlockSizeParallelVectorOps);
   return std::sqrt(std::accumulate(norms.begin(), norms.end(), 0.));
 }
 inline void SetZero(Vector& x, ContextImpl* context, int num_threads) {
@@ -74,18 +77,20 @@
                   const VectorLikeY& y,
                   ContextImpl* context,
                   int num_threads) {
-  std::vector<double> dots(num_threads);
-  ParallelFor(context,
-              0,
-              x.rows(),
-              num_threads,
-              [&x, &y, &dots](int thread_id, std::tuple<int, int> range) {
-                auto [start, end] = range;
-                const int block_size = end - start;
-                const auto& x_block = x.segment(start, block_size);
-                const auto& y_block = y.segment(start, block_size);
-                dots[thread_id] += x_block.dot(y_block);
-              });
+  FixedArray<double> dots(num_threads, 0.);
+  ParallelFor(
+      context,
+      0,
+      x.rows(),
+      num_threads,
+      [&x, &y, &dots](int thread_id, std::tuple<int, int> range) {
+        auto [start, end] = range;
+        const int block_size = end - start;
+        const auto& x_block = x.segment(start, block_size);
+        const auto& y_block = y.segment(start, block_size);
+        dots[thread_id] += x_block.dot(y_block);
+      },
+      kMinBlockSizeParallelVectorOps);
   return std::accumulate(dots.begin(), dots.end(), 0.);
 }
 inline void Copy(const Vector& from,
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index 549dbe5..11db1fb 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -67,20 +67,29 @@
 // When distributing workload between threads, it is assumed that each loop
 // iteration takes approximately equal time to complete.
 template <typename F>
-void ParallelFor(
-    ContextImpl* context, int start, int end, int num_threads, F&& function) {
+void ParallelFor(ContextImpl* context,
+                 int start,
+                 int end,
+                 int num_threads,
+                 F&& function,
+                 int min_block_size = 1) {
   CHECK_GT(num_threads, 0);
   if (start >= end) {
     return;
   }
 
-  if (num_threads == 1 || end - start == 1) {
+  if (num_threads == 1 || end - start < min_block_size * 2) {
     InvokeOnSegment(0, std::make_tuple(start, end), std::forward<F>(function));
     return;
   }
 
   CHECK(context != nullptr);
-  ParallelInvoke(context, start, end, num_threads, std::forward<F>(function));
+  ParallelInvoke(context,
+                 start,
+                 end,
+                 num_threads,
+                 std::forward<F>(function),
+                 min_block_size);
 }
 
 // Execute function for every element in the range [start, end) with at most
diff --git a/internal/ceres/parallel_for_test.cc b/internal/ceres/parallel_for_test.cc
index b18a1c6..58a27d4 100644
--- a/internal/ceres/parallel_for_test.cc
+++ b/internal/ceres/parallel_for_test.cc
@@ -94,6 +94,29 @@
   }
 }
 
+// Tests parallel for loop with ranges and lower bound on minimal range size
+TEST(ParallelForWithRange, MinimalSize) {
+  ContextImpl context;
+  constexpr int kNumThreads = 4;
+  constexpr int kMinBlockSize = 5;
+  context.EnsureMinimumThreads(kNumThreads);
+
+  for (int size = kMinBlockSize; size <= 25; ++size) {
+    std::atomic<bool> failed(false);
+    ParallelFor(
+        &context,
+        0,
+        size,
+        kNumThreads,
+        [&failed, kMinBlockSize](std::tuple<int, int> range) {
+          auto [start, end] = range;
+          if (end - start < kMinBlockSize) failed = true;
+        },
+        kMinBlockSize);
+    EXPECT_EQ(failed, false);
+  }
+}
+
 // Tests the parallel for loop with the thread ID interface computes the correct
 // result for various number of threads.
 TEST(ParallelForWithThreadId, NumThreads) {
diff --git a/internal/ceres/parallel_invoke.h b/internal/ceres/parallel_invoke.h
index 707751b..398f8f2 100644
--- a/internal/ceres/parallel_invoke.h
+++ b/internal/ceres/parallel_invoke.h
@@ -145,9 +145,9 @@
 
 // This implementation uses a fixed size max worker pool with a shared task
 // queue. The problem of executing the function for the interval of [start, end)
-// is broken up into at most num_threads * kWorkBlocksPerThread blocks
-// and added to the thread pool. To avoid deadlocks, the calling thread is
-// allowed to steal work from the worker pool.
+// is broken up into at most num_threads * kWorkBlocksPerThread blocks (each of
+// size at least min_block_size) and added to the thread pool. To avoid
+// deadlocks, the calling thread is allowed to steal work from the worker pool.
 // This is implemented via a shared state between the tasks. In order for
 // the calling thread or thread pool to get a block of work, it will query the
 // shared state for the next block of work to be done. If there is nothing left,
@@ -162,8 +162,12 @@
 // A performance analysis has shown this implementation is on par with OpenMP
 // and TBB.
 template <typename F>
-void ParallelInvoke(
-    ContextImpl* context, int start, int end, int num_threads, F&& function) {
+void ParallelInvoke(ContextImpl* context,
+                    int start,
+                    int end,
+                    int num_threads,
+                    F&& function,
+                    int min_block_size) {
   CHECK(context != nullptr);
 
   // Maximal number of work items scheduled for a single thread
@@ -176,8 +180,8 @@
   //
   // In order to avoid creating empty blocks of work, we need to limit
   // number of work blocks by a total number of indices.
-  const int num_work_blocks =
-      std::min((end - start), num_threads * kWorkBlocksPerThread);
+  const int num_work_blocks = std::min((end - start) / min_block_size,
+                                       num_threads * kWorkBlocksPerThread);
 
   // We use a std::shared_ptr because the main thread can finish all
   // the work before the tasks have been popped off the queue.  So the
diff --git a/internal/ceres/parallel_vector_operations_benchmark.cc b/internal/ceres/parallel_vector_operations_benchmark.cc
index 6fe756a..8b55def 100644
--- a/internal/ceres/parallel_vector_operations_benchmark.cc
+++ b/internal/ceres/parallel_vector_operations_benchmark.cc
@@ -33,20 +33,47 @@
 #include "ceres/parallel_for.h"
 
 namespace ceres::internal {
+// Older versions of benchmark library (for example, one shipped with
+// ubuntu 20.04) do not support range generation and range products
+#define VECTOR_SIZES(num_threads)    \
+      Args({1 << 7, num_threads})    \
+      ->Args({1 << 8, num_threads})  \
+      ->Args({1 << 9, num_threads})  \
+      ->Args({1 << 10, num_threads}) \
+      ->Args({1 << 11, num_threads}) \
+      ->Args({1 << 12, num_threads}) \
+      ->Args({1 << 13, num_threads}) \
+      ->Args({1 << 14, num_threads}) \
+      ->Args({1 << 15, num_threads}) \
+      ->Args({1 << 16, num_threads}) \
+      ->Args({1 << 17, num_threads}) \
+      ->Args({1 << 18, num_threads}) \
+      ->Args({1 << 19, num_threads}) \
+      ->Args({1 << 20, num_threads}) \
+      ->Args({1 << 21, num_threads}) \
+      ->Args({1 << 22, num_threads}) \
+      ->Args({1 << 23, num_threads})
 
-const int kVectorSize = 64 * 1024 * 1024 / sizeof(double);
+#define VECTOR_SIZE_THREADS \
+  VECTOR_SIZES(1)           \
+      ->VECTOR_SIZES(2)     \
+      ->VECTOR_SIZES(4)     \
+      ->VECTOR_SIZES(8)     \
+      ->VECTOR_SIZES(16)
 
 static void SetZero(benchmark::State& state) {
+  const int kVectorSize = static_cast<int>(state.range(0));
   Vector x = Vector::Random(kVectorSize);
   for (auto _ : state) {
     x.setZero();
   }
   CHECK_EQ(x.squaredNorm(), 0.);
 }
-BENCHMARK(SetZero);
+BENCHMARK(SetZero)->VECTOR_SIZES(1);
 
 static void SetZeroParallel(benchmark::State& state) {
-  const int num_threads = static_cast<int>(state.range(0));
+  const int kVectorSize = static_cast<int>(state.range(0));
+  const int num_threads = static_cast<int>(state.range(1));
   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);
 
@@ -56,9 +83,10 @@
   }
   CHECK_EQ(x.squaredNorm(), 0.);
 }
-BENCHMARK(SetZeroParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+BENCHMARK(SetZeroParallel)->VECTOR_SIZE_THREADS;
 
 static void Negate(benchmark::State& state) {
+  const int kVectorSize = static_cast<int>(state.range(0));
   Vector x = Vector::Random(kVectorSize).normalized();
   const Vector x_init = x;
 
@@ -67,10 +95,11 @@
   }
   CHECK((x - x_init).squaredNorm() == 0. || (x + x_init).squaredNorm() == 0);
 }
-BENCHMARK(Negate);
+BENCHMARK(Negate)->VECTOR_SIZES(1);
 
 static void NegateParallel(benchmark::State& state) {
-  const int num_threads = static_cast<int>(state.range(0));
+  const int kVectorSize = static_cast<int>(state.range(0));
+  const int num_threads = static_cast<int>(state.range(1));
   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);
 
@@ -82,9 +111,10 @@
   }
   CHECK((x - x_init).squaredNorm() == 0. || (x + x_init).squaredNorm() == 0);
 }
-BENCHMARK(NegateParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+BENCHMARK(NegateParallel)->VECTOR_SIZE_THREADS;
 
 static void Assign(benchmark::State& state) {
+  const int kVectorSize = static_cast<int>(state.range(0));
   Vector x = Vector::Random(kVectorSize);
   Vector y = Vector(kVectorSize);
   for (auto _ : state) {
@@ -92,10 +122,11 @@
   }
   CHECK_EQ((y - x).squaredNorm(), 0.);
 }
-BENCHMARK(Assign);
+BENCHMARK(Assign)->VECTOR_SIZES(1);
 
 static void AssignParallel(benchmark::State& state) {
-  const int num_threads = static_cast<int>(state.range(0));
+  const int kVectorSize = static_cast<int>(state.range(0));
+  const int num_threads = static_cast<int>(state.range(1));
   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);
 
@@ -107,9 +138,10 @@
   }
   CHECK_EQ((y - x).squaredNorm(), 0.);
 }
-BENCHMARK(AssignParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+BENCHMARK(AssignParallel)->VECTOR_SIZE_THREADS;
 
 static void D2X(benchmark::State& state) {
+  const int kVectorSize = static_cast<int>(state.range(0));
   const Vector x = Vector::Random(kVectorSize);
   const Vector D = Vector::Random(kVectorSize);
   Vector y = Vector::Zero(kVectorSize);
@@ -118,10 +150,11 @@
   }
   CHECK_GT(y.squaredNorm(), 0.);
 }
-BENCHMARK(D2X);
+BENCHMARK(D2X)->VECTOR_SIZES(1);
 
 static void D2XParallel(benchmark::State& state) {
-  const int num_threads = static_cast<int>(state.range(0));
+  const int kVectorSize = static_cast<int>(state.range(0));
+  const int num_threads = static_cast<int>(state.range(1));
   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);
 
@@ -134,9 +167,10 @@
   }
   CHECK_GT(y.squaredNorm(), 0.);
 }
-BENCHMARK(D2XParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+BENCHMARK(D2XParallel)->VECTOR_SIZE_THREADS;
 
 static void DivideSqrt(benchmark::State& state) {
+  const int kVectorSize = static_cast<int>(state.range(0));
   Vector diagonal = Vector::Random(kVectorSize).array().abs();
   const double radius = 0.5;
   for (auto _ : state) {
@@ -144,10 +178,11 @@
   }
   CHECK_GT(diagonal.squaredNorm(), 0.);
 }
-BENCHMARK(DivideSqrt);
+BENCHMARK(DivideSqrt)->VECTOR_SIZES(1);
 
 static void DivideSqrtParallel(benchmark::State& state) {
-  const int num_threads = static_cast<int>(state.range(0));
+  const int kVectorSize = static_cast<int>(state.range(0));
+  const int num_threads = static_cast<int>(state.range(1));
   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);
 
@@ -159,9 +194,10 @@
   }
   CHECK_GT(diagonal.squaredNorm(), 0.);
 }
-BENCHMARK(DivideSqrtParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+BENCHMARK(DivideSqrtParallel)->VECTOR_SIZE_THREADS;
 
 static void Clamp(benchmark::State& state) {
+  const int kVectorSize = static_cast<int>(state.range(0));
   Vector diagonal = Vector::Random(kVectorSize);
   const double min = -0.5;
   const double max = 0.5;
@@ -173,10 +209,11 @@
   CHECK_LE(diagonal.maxCoeff(), 0.5);
   CHECK_GE(diagonal.minCoeff(), -0.5);
 }
-BENCHMARK(Clamp);
+BENCHMARK(Clamp)->VECTOR_SIZES(1);
 
 static void ClampParallel(benchmark::State& state) {
-  const int num_threads = static_cast<int>(state.range(0));
+  const int kVectorSize = static_cast<int>(state.range(0));
+  const int num_threads = static_cast<int>(state.range(1));
   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);
 
@@ -190,9 +227,10 @@
   CHECK_LE(diagonal.maxCoeff(), 0.5);
   CHECK_GE(diagonal.minCoeff(), -0.5);
 }
-BENCHMARK(ClampParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+BENCHMARK(ClampParallel)->VECTOR_SIZE_THREADS;
 
 static void Norm(benchmark::State& state) {
+  const int kVectorSize = static_cast<int>(state.range(0));
   const Vector x = Vector::Random(kVectorSize);
 
   double total = 0.;
@@ -201,10 +239,11 @@
   }
   CHECK_GT(total, 0.);
 }
-BENCHMARK(Norm);
+BENCHMARK(Norm)->VECTOR_SIZES(1);
 
 static void NormParallel(benchmark::State& state) {
-  const int num_threads = static_cast<int>(state.range(0));
+  const int kVectorSize = static_cast<int>(state.range(0));
+  const int num_threads = static_cast<int>(state.range(1));
   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);
 
@@ -216,9 +255,10 @@
   }
   CHECK_GT(total, 0.);
 }
-BENCHMARK(NormParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+BENCHMARK(NormParallel)->VECTOR_SIZE_THREADS;
 
 static void Dot(benchmark::State& state) {
+  const int kVectorSize = static_cast<int>(state.range(0));
   const Vector x = Vector::Random(kVectorSize);
   const Vector y = Vector::Random(kVectorSize);
 
@@ -228,10 +268,11 @@
   }
   CHECK_NE(total, 0.);
 }
-BENCHMARK(Dot);
+BENCHMARK(Dot)->VECTOR_SIZES(1);
 
 static void DotParallel(benchmark::State& state) {
-  const int num_threads = static_cast<int>(state.range(0));
+  const int kVectorSize = static_cast<int>(state.range(0));
+  const int num_threads = static_cast<int>(state.range(1));
   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);
 
@@ -244,9 +285,10 @@
   }
   CHECK_NE(total, 0.);
 }
-BENCHMARK(DotParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+BENCHMARK(DotParallel)->VECTOR_SIZE_THREADS;
 
 static void Axpby(benchmark::State& state) {
+  const int kVectorSize = static_cast<int>(state.range(0));
   const Vector x = Vector::Random(kVectorSize);
   const Vector y = Vector::Random(kVectorSize);
   Vector z = Vector::Zero(kVectorSize);
@@ -258,10 +300,11 @@
   }
   CHECK_GT(z.squaredNorm(), 0.);
 }
-BENCHMARK(Axpby);
+BENCHMARK(Axpby)->VECTOR_SIZES(1);
 
 static void AxpbyParallel(benchmark::State& state) {
-  const int num_threads = static_cast<int>(state.range(0));
+  const int kVectorSize = static_cast<int>(state.range(0));
+  const int num_threads = static_cast<int>(state.range(1));
   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);
 
@@ -276,7 +319,7 @@
   }
   CHECK_GT(z.squaredNorm(), 0.);
 }
-BENCHMARK(AxpbyParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+BENCHMARK(AxpbyParallel)->VECTOR_SIZE_THREADS;
 
 }  // namespace ceres::internal
 
diff --git a/internal/ceres/parallel_for.cc b/internal/ceres/parallel_vector_ops.cc
similarity index 79%
rename from internal/ceres/parallel_for.cc
rename to internal/ceres/parallel_vector_ops.cc
index 133d7ce..2154ece 100644
--- a/internal/ceres/parallel_for.cc
+++ b/internal/ceres/parallel_vector_ops.cc
@@ -25,37 +25,28 @@
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: vitus@google.com (Michael Vitus)
+
+#include "ceres/parallel_vector_ops.h"
+
+#include <algorithm>
 
 #include "ceres/parallel_for.h"
 
-#include <algorithm>
-#include <atomic>
-#include <cmath>
-#include <condition_variable>
-#include <memory>
-#include <mutex>
-#include <tuple>
-
-#include "ceres/internal/config.h"
-#include "ceres/parallel_vector_ops.h"
-#include "glog/logging.h"
-
 namespace ceres::internal {
-
 void ParallelSetZero(ContextImpl* context,
                      int num_threads,
                      double* values,
                      int num_values) {
-  ParallelFor(context,
-              0,
-              num_values,
-              num_threads,
-              [values](std::tuple<int, int> range) {
-                auto [start, end] = range;
-                std::fill(values + start, values + end, 0.);
-              });
+  ParallelFor(
+      context,
+      0,
+      num_values,
+      num_threads,
+      [values](std::tuple<int, int> range) {
+        auto [start, end] = range;
+        std::fill(values + start, values + end, 0.);
+      },
+      kMinBlockSizeParallelVectorOps);
 }
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/parallel_vector_ops.h b/internal/ceres/parallel_vector_ops.h
index 20d6dbc..812950a 100644
--- a/internal/ceres/parallel_vector_ops.h
+++ b/internal/ceres/parallel_vector_ops.h
@@ -42,14 +42,16 @@
 
 namespace ceres::internal {
 
+// Lower bound on block size for parallel vector operations.
+// Operations with vectors of less than kMinBlockSizeParallelVectorOps elements
+// will be executed in a single thread.
+constexpr int kMinBlockSizeParallelVectorOps = 1 << 16;
 // Evaluate vector expression in parallel
-// Assuming LhsExpression and RhsExpression are some sort of
-// column-vector expression, assignment lhs = rhs
-// is eavluated over a set of contiguous blocks in parallel.
-// This is expected to work well in the case of vector-based
-// expressions (since they typically do not result into
-// temporaries).
-// This method expects lhs to be size-compatible with rhs
+// Assuming LhsExpression and RhsExpression are some sort of column-vector
+// expression, assignment lhs = rhs is eavluated over a set of contiguous blocks
+// in parallel. This is expected to work well in the case of vector-based
+// expressions (since they typically do not result into temporaries). This
+// method expects lhs to be size-compatible with rhs
 template <typename LhsExpression, typename RhsExpression>
 void ParallelAssign(ContextImpl* context,
                     int num_threads,
@@ -59,15 +61,16 @@
   static_assert(RhsExpression::ColsAtCompileTime == 1);
   CHECK_EQ(lhs.rows(), rhs.rows());
   const int num_rows = lhs.rows();
-  ParallelFor(context,
-              0,
-              num_rows,
-              num_threads,
-              [&lhs, &rhs](const std::tuple<int, int>& range) {
-                auto [start, end] = range;
-                lhs.segment(start, end - start) =
-                    rhs.segment(start, end - start);
-              });
+  ParallelFor(
+      context,
+      0,
+      num_rows,
+      num_threads,
+      [&lhs, &rhs](const std::tuple<int, int>& range) {
+        auto [start, end] = range;
+        lhs.segment(start, end - start) = rhs.segment(start, end - start);
+      },
+      kMinBlockSizeParallelVectorOps);
 }
 
 // Set vector to zero using num_threads