diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 0436178..5bd2792 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -589,6 +589,9 @@
   add_executable(parallel_vector_operations_benchmark parallel_vector_operations_benchmark.cc)
   add_dependencies_to_benchmark(parallel_vector_operations_benchmark)
 
+  add_executable(parallel_for_benchmark parallel_for_benchmark.cc)
+  add_dependencies_to_benchmark(parallel_for_benchmark)
+
   add_executable(block_jacobi_preconditioner_benchmark
     block_jacobi_preconditioner_benchmark.cc)
   add_dependencies_to_benchmark(block_jacobi_preconditioner_benchmark)
diff --git a/internal/ceres/parallel_for_benchmark.cc b/internal/ceres/parallel_for_benchmark.cc
new file mode 100644
index 0000000..f1cd0d9
--- /dev/null
+++ b/internal/ceres/parallel_for_benchmark.cc
@@ -0,0 +1,73 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#include "benchmark/benchmark.h"
+#include "ceres/parallel_for.h"
+
+namespace ceres::internal {
+
+// Parallel for with very small amount of work per iteration and small amount of
+// iterations benchmarks performance of task scheduling
+static void SchedulerBenchmark(benchmark::State& state) {
+  const int vector_size = static_cast<int>(state.range(0));
+  const int num_threads = static_cast<int>(state.range(1));
+  ContextImpl context;
+  context.EnsureMinimumThreads(num_threads);
+
+  Vector x = Vector::Random(vector_size);
+  for (auto _ : state) {
+    ParallelFor(
+        &context, 0, vector_size, num_threads, [&x](int id) { x[id] = 0.; });
+  }
+  CHECK_EQ(x.squaredNorm(), 0.);
+}
+BENCHMARK(SchedulerBenchmark)
+    ->Args({128, 1})
+    ->Args({128, 2})
+    ->Args({128, 4})
+    ->Args({128, 8})
+    ->Args({128, 16})
+    ->Args({256, 1})
+    ->Args({256, 2})
+    ->Args({256, 4})
+    ->Args({256, 8})
+    ->Args({256, 16})
+    ->Args({1024, 1})
+    ->Args({1024, 2})
+    ->Args({1024, 4})
+    ->Args({1024, 8})
+    ->Args({1024, 16})
+    ->Args({4096, 1})
+    ->Args({4096, 2})
+    ->Args({4096, 4})
+    ->Args({4096, 8})
+    ->Args({4096, 16});
+
+}  // namespace ceres::internal
+
+BENCHMARK_MAIN();
diff --git a/internal/ceres/parallel_invoke.h b/internal/ceres/parallel_invoke.h
index 3e67e37..707751b 100644
--- a/internal/ceres/parallel_invoke.h
+++ b/internal/ceres/parallel_invoke.h
@@ -185,8 +185,10 @@
   auto shared_state =
       std::make_shared<ParallelInvokeState>(start, end, num_work_blocks);
 
-  // A function which tries to perform several chunks of work.
-  auto task = [shared_state, num_threads, &function]() {
+  // A function which tries to schedule another task in the thread pool and
+  // perform several chunks of work. Function expects itself as the argument in
+  // order to schedule next task in the thread pool.
+  auto task = [context, shared_state, num_threads, &function](auto& task_copy) {
     int num_jobs_finished = 0;
     const int thread_id = shared_state->thread_id.fetch_add(1);
     // In order to avoid dead-locks in nested parallel for loops, task() will be
@@ -197,11 +199,21 @@
     //  the last task being executed will be terminated here in order to avoid
     //  having more than num_threads active threads
     if (thread_id >= num_threads) return;
+    const int num_work_blocks = shared_state->num_work_blocks;
+    if (thread_id + 1 < num_threads &&
+        shared_state->block_id < num_work_blocks) {
+      // Add another thread to the thread pool.
+      // Note we are taking the task as value so the copy of shared_state shared
+      // pointer (captured by value at declaration of task lambda-function) is
+      // copied and the ref count is increased. This is to prevent it from being
+      // deleted when the main thread finishes all the work and exits before the
+      // threads finish.
+      context->thread_pool.AddTask([task_copy]() { task_copy(task_copy); });
+    }
 
     const int start = shared_state->start;
     const int base_block_size = shared_state->base_block_size;
     const int num_base_p1_sized_blocks = shared_state->num_base_p1_sized_blocks;
-    const int num_work_blocks = shared_state->num_work_blocks;
 
     while (true) {
       // Get the next available chunk of work to be performed. If there is no
@@ -242,20 +254,10 @@
     shared_state->block_until_finished.Finished(num_jobs_finished);
   };
 
-  // Add all the tasks to the thread pool.
-  for (int i = 0; i < num_threads; ++i) {
-    // Note we are taking the task as value so the copy of shared_state shared
-    // pointer (captured by value at declaration of task lambda-function) is
-    // copied and the ref count is increased. This is to prevent it from being
-    // deleted when the main thread finishes all the work and exits before the
-    // threads finish.
-    context->thread_pool.AddTask([task]() { task(); });
-  }
-
-  // Try to do any available work on the main thread. This may steal work from
-  // the thread pool, but when there is no work left the thread pool tasks
-  // will be no-ops.
-  task();
+  // Start scheduling threads and doing work. We might end up with less threads
+  // scheduled than expected, if scheduling overhead is larger than the amount
+  // of work to be done.
+  task(task);
 
   // Wait until all tasks have finished.
   shared_state->block_until_finished.Block();
