Introduce benchmark for Jet operations

Run on (20 X 4300 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x10)
L1 Instruction 32 KiB (x10)
L2 Unified 1024 KiB (x10)
L3 Unified 14080 KiB (x1)
Load Average: 2.37, 3.00, 3.08
-----------------------------------------------------------------------------
Benchmark                                   Time             CPU   Iterations
-----------------------------------------------------------------------------
Addition<3>/1000                         2872 ns         2872 ns       240449
Addition<10>/1000                        5304 ns         5304 ns       100000
Addition<15>/1000                        8211 ns         8210 ns        78742
Addition<25>/1000                       14214 ns        14213 ns        46762
Addition<32>/1000                       13746 ns        13746 ns        50892
Addition<200>/160                       41228 ns        41228 ns        17183
AdditionScalar<3>/1000                   2514 ns         2514 ns       273996
AdditionScalar<10>/1000                  2733 ns         2733 ns       255508
AdditionScalar<15>/1000                  2622 ns         2622 ns       264291
AdditionScalar<25>/1000                  3753 ns         3753 ns       183508
AdditionScalar<32>/1000                  4254 ns         4254 ns       167016
AdditionScalar<200>/160                 18314 ns        18314 ns        38116
Subtraction<3>/1000                      3241 ns         3241 ns       206370
Subtraction<10>/1000                     5023 ns         5023 ns       139271
Subtraction<15>/1000                     8387 ns         8386 ns        89927
Subtraction<25>/1000                    14951 ns        14950 ns        48756
Subtraction<32>/1000                    14587 ns        14587 ns        47056
Subtraction<200>/160                    47175 ns        47175 ns        15574
SubtractionScalar<3>/1000                2572 ns         2572 ns       264468
SubtractionScalar<10>/1000               2713 ns         2713 ns       257920
SubtractionScalar<15>/1000               2621 ns         2621 ns       265289
SubtractionScalar<25>/1000               3593 ns         3593 ns       192266
SubtractionScalar<32>/1000               4255 ns         4255 ns       163738
SubtractionScalar<200>/160              19906 ns        19906 ns        35295
Multiplication<3>/1000                   6058 ns         6058 ns       114067
Multiplication<10>/1000                 11999 ns        11999 ns        58492
Multiplication<15>/1000                 17906 ns        17905 ns        39565
Multiplication<25>/1000                 27361 ns        27360 ns        25335
Multiplication<32>/1000                 33074 ns        33074 ns        20875
Multiplication<200>/160                 61364 ns        61362 ns        11542
MultiplicationLeftScalar<3>/1000         3104 ns         3104 ns       223720
MultiplicationLeftScalar<10>/1000        4549 ns         4549 ns       154366
MultiplicationLeftScalar<15>/1000        5921 ns         5921 ns       119294
MultiplicationLeftScalar<25>/1000       11429 ns        11428 ns        61685
MultiplicationLeftScalar<32>/1000       14094 ns        14094 ns        49941
MultiplicationLeftScalar<200>/160       28186 ns        28185 ns        24484
MultiplicationRightScalar<3>/1000        3110 ns         3110 ns       223333
MultiplicationRightScalar<10>/1000       4655 ns         4655 ns       150534
MultiplicationRightScalar<15>/1000       5890 ns         5890 ns       119746
MultiplicationRightScalar<25>/1000      11464 ns        11464 ns        61483
MultiplicationRightScalar<32>/1000      14243 ns        14242 ns        49492
MultiplicationRightScalar<200>/160      28282 ns        28281 ns        24604
Division<3>/1000                         9128 ns         9128 ns        77846
Division<10>/1000                       14811 ns        14811 ns        47682
Division<15>/1000                       23293 ns        23292 ns        30091
Division<25>/1000                       37313 ns        37313 ns        18608
Division<32>/1000                       41229 ns        41229 ns        16982
Division<200>/160                       44802 ns        44802 ns        15573
DivisionLeftScalar<3>/1000               6720 ns         6720 ns       104747
DivisionLeftScalar<10>/1000              9403 ns         9402 ns        75216
DivisionLeftScalar<15>/1000             12313 ns        12313 ns        57366
DivisionLeftScalar<25>/1000             22739 ns        22739 ns        30421
DivisionLeftScalar<32>/1000             20321 ns        20321 ns        34191
DivisionLeftScalar<200>/160             29018 ns        29017 ns        23908
DivisionRightScalar<3>/1000              3815 ns         3815 ns       182333
DivisionRightScalar<10>/1000             5750 ns         5750 ns       121691
DivisionRightScalar<15>/1000             7574 ns         7574 ns        92994
DivisionRightScalar<25>/1000            13953 ns        13953 ns        49250
DivisionRightScalar<32>/1000            16892 ns        16892 ns        41668
DivisionRightScalar<200>/160            28663 ns        28662 ns        24226
MultiplyAndAdd<3>/1000                   4399 ns         4399 ns       158635
MultiplyAndAdd<10>/1000                 10453 ns        10453 ns        68112
MultiplyAndAdd<15>/1000                 11830 ns        11830 ns        59598
MultiplyAndAdd<25>/1000                 19624 ns        19624 ns        36240
MultiplyAndAdd<32>/1000                 25539 ns        25538 ns        29066
MultiplyAndAdd<200>/160                 65362 ns        65358 ns        11086

Change-Id: Ie62492b3fd19ff9d3394f90bd00f0aa01522fc2a
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 140b250..80f8bdc 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -544,6 +544,9 @@
   add_executable(schur_eliminator_benchmark schur_eliminator_benchmark.cc)
   add_dependencies_to_benchmark(schur_eliminator_benchmark)
 
+  add_executable(jet_operator_benchmark jet_operator_benchmark.cc)
+  add_dependencies_to_benchmark(jet_operator_benchmark)
+
   add_subdirectory(autodiff_benchmarks)
 endif (BUILD_BENCHMARKS)
 
diff --git a/internal/ceres/jet_operator_benchmark.cc b/internal/ceres/jet_operator_benchmark.cc
new file mode 100644
index 0000000..5701556
--- /dev/null
+++ b/internal/ceres/jet_operator_benchmark.cc
@@ -0,0 +1,289 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2021 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: alex@karatarakis.com (Alexander Karatarakis)
+
+#include <array>
+
+#include "benchmark/benchmark.h"
+#include "ceres/jet.h"
+
+namespace ceres {
+
+// Cycle the Jets to avoid caching effects in the benchmark.
+template <class JetType>
+class JetInputData {
+  using T = typename JetType::Scalar;
+  static constexpr std::size_t SIZE = 20;
+
+ public:
+  JetInputData() : index_{0}, a_{}, b_{}, c_{}, d_{}, e_{} {
+    for (int i = 0; i < static_cast<int>(SIZE); i++) {
+      const T ti = static_cast<T>(i + 1);
+
+      a_[i].a = T(1.1) * ti;
+      a_[i].v.setRandom();
+
+      b_[i].a = T(2.2) * ti;
+      b_[i].v.setRandom();
+
+      c_[i].a = T(3.3) * ti;
+      c_[i].v.setRandom();
+
+      d_[i].a = T(4.4) * ti;
+      d_[i].v.setRandom();
+
+      e_[i].a = T(5.5) * ti;
+      e_[i].v.setRandom();
+
+      scalar_a_[i] = T(1.1) * ti;
+      scalar_b_[i] = T(2.2) * ti;
+      scalar_c_[i] = T(3.3) * ti;
+      scalar_d_[i] = T(4.4) * ti;
+      scalar_e_[i] = T(5.5) * ti;
+    }
+  }
+
+  void advance() { index_ = (index_ + 1) % SIZE; }
+
+  const JetType& a() const { return a_[index_]; }
+  const JetType& b() const { return b_[index_]; }
+  const JetType& c() const { return c_[index_]; }
+  const JetType& d() const { return d_[index_]; }
+  const JetType& e() const { return e_[index_]; }
+  T scalar_a() const { return scalar_a_[index_]; }
+  T scalar_b() const { return scalar_b_[index_]; }
+  T scalar_c() const { return scalar_c_[index_]; }
+  T scalar_d() const { return scalar_d_[index_]; }
+  T scalar_e() const { return scalar_e_[index_]; }
+
+ private:
+  std::size_t index_;
+  std::array<JetType, SIZE> a_;
+  std::array<JetType, SIZE> b_;
+  std::array<JetType, SIZE> c_;
+  std::array<JetType, SIZE> d_;
+  std::array<JetType, SIZE> e_;
+  std::array<T, SIZE> scalar_a_;
+  std::array<T, SIZE> scalar_b_;
+  std::array<T, SIZE> scalar_c_;
+  std::array<T, SIZE> scalar_d_;
+  std::array<T, SIZE> scalar_e_;
+};
+
+template <std::size_t JET_SIZE, class Function>
+static void JetBenchmarkHelper(benchmark::State& state, const Function& func) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetInputData<JetType> data{};
+  JetType out{};
+  const int iterations = static_cast<int>(state.range(0));
+  for (auto _ : state) {
+    for (int i = 0; i < iterations; i++) {
+      func(data, out);
+      data.advance();
+    }
+  }
+  benchmark::DoNotOptimize(out);
+}
+
+template <std::size_t JET_SIZE>
+static void Addition(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out += +d.a() + d.b() + d.c() + d.d() + d.e();
+      });
+}
+BENCHMARK_TEMPLATE(Addition, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(Addition, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(Addition, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(Addition, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(Addition, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(Addition, 200)->Arg(160);
+
+template <std::size_t JET_SIZE>
+static void AdditionScalar(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out +=
+            d.scalar_a() + d.scalar_b() + d.c() + d.scalar_d() + d.scalar_e();
+      });
+}
+BENCHMARK_TEMPLATE(AdditionScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(AdditionScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(AdditionScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(AdditionScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(AdditionScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(AdditionScalar, 200)->Arg(160);
+
+template <std::size_t JET_SIZE>
+static void Subtraction(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out -= -d.a() - d.b() - d.c() - d.d() - d.e();
+      });
+}
+BENCHMARK_TEMPLATE(Subtraction, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(Subtraction, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(Subtraction, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(Subtraction, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(Subtraction, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(Subtraction, 200)->Arg(160);
+
+template <std::size_t JET_SIZE>
+static void SubtractionScalar(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out -=
+            -d.scalar_a() - d.scalar_b() - d.c() - d.scalar_d() - d.scalar_e();
+      });
+}
+BENCHMARK_TEMPLATE(SubtractionScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(SubtractionScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(SubtractionScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(SubtractionScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(SubtractionScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(SubtractionScalar, 200)->Arg(160);
+
+template <std::size_t JET_SIZE>
+static void Multiplication(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out *= d.a() * d.b() * d.c() * d.d() * d.e();
+      });
+}
+BENCHMARK_TEMPLATE(Multiplication, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(Multiplication, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(Multiplication, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(Multiplication, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(Multiplication, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(Multiplication, 200)->Arg(160);
+
+template <std::size_t JET_SIZE>
+static void MultiplicationLeftScalar(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out += d.scalar_a() *
+               (d.scalar_b() * (d.scalar_c() * (d.scalar_d() * d.e())));
+      });
+}
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 200)->Arg(160);
+
+template <std::size_t JET_SIZE>
+static void MultiplicationRightScalar(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out += (((d.a() * d.scalar_b()) * d.scalar_c()) * d.scalar_d()) *
+               d.scalar_e();
+      });
+}
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 200)->Arg(160);
+
+template <std::size_t JET_SIZE>
+static void Division(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out /= d.a() / d.b() / d.c() / d.d() / d.e();
+      });
+}
+BENCHMARK_TEMPLATE(Division, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(Division, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(Division, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(Division, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(Division, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(Division, 200)->Arg(160);
+
+template <std::size_t JET_SIZE>
+static void DivisionLeftScalar(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out += d.scalar_a() /
+               (d.scalar_b() / (d.scalar_c() / (d.scalar_d() / d.e())));
+      });
+}
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 200)->Arg(160);
+
+template <std::size_t JET_SIZE>
+static void DivisionRightScalar(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out += (((d.a() / d.scalar_b()) / d.scalar_c()) / d.scalar_d()) /
+               d.scalar_e();
+      });
+}
+BENCHMARK_TEMPLATE(DivisionRightScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionRightScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionRightScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionRightScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionRightScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionRightScalar, 200)->Arg(160);
+
+template <std::size_t JET_SIZE>
+static void MultiplyAndAdd(benchmark::State& state) {
+  using JetType = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(
+      state, [](const JetInputData<JetType>& d, JetType& out) {
+        out += d.scalar_a() * d.a() + d.scalar_b() * d.b() +
+               d.scalar_c() * d.c() + d.scalar_d() * d.d() +
+               d.scalar_e() * d.e();
+      });
+}
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 200)->Arg(160);
+
+}  // namespace ceres
+
+BENCHMARK_MAIN();