Introduce benchmark for Jet operations
Run on (20 X 4300 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x10)
L1 Instruction 32 KiB (x10)
L2 Unified 1024 KiB (x10)
L3 Unified 14080 KiB (x1)
Load Average: 2.37, 3.00, 3.08
-----------------------------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------------------------
Addition<3>/1000 2872 ns 2872 ns 240449
Addition<10>/1000 5304 ns 5304 ns 100000
Addition<15>/1000 8211 ns 8210 ns 78742
Addition<25>/1000 14214 ns 14213 ns 46762
Addition<32>/1000 13746 ns 13746 ns 50892
Addition<200>/160 41228 ns 41228 ns 17183
AdditionScalar<3>/1000 2514 ns 2514 ns 273996
AdditionScalar<10>/1000 2733 ns 2733 ns 255508
AdditionScalar<15>/1000 2622 ns 2622 ns 264291
AdditionScalar<25>/1000 3753 ns 3753 ns 183508
AdditionScalar<32>/1000 4254 ns 4254 ns 167016
AdditionScalar<200>/160 18314 ns 18314 ns 38116
Subtraction<3>/1000 3241 ns 3241 ns 206370
Subtraction<10>/1000 5023 ns 5023 ns 139271
Subtraction<15>/1000 8387 ns 8386 ns 89927
Subtraction<25>/1000 14951 ns 14950 ns 48756
Subtraction<32>/1000 14587 ns 14587 ns 47056
Subtraction<200>/160 47175 ns 47175 ns 15574
SubtractionScalar<3>/1000 2572 ns 2572 ns 264468
SubtractionScalar<10>/1000 2713 ns 2713 ns 257920
SubtractionScalar<15>/1000 2621 ns 2621 ns 265289
SubtractionScalar<25>/1000 3593 ns 3593 ns 192266
SubtractionScalar<32>/1000 4255 ns 4255 ns 163738
SubtractionScalar<200>/160 19906 ns 19906 ns 35295
Multiplication<3>/1000 6058 ns 6058 ns 114067
Multiplication<10>/1000 11999 ns 11999 ns 58492
Multiplication<15>/1000 17906 ns 17905 ns 39565
Multiplication<25>/1000 27361 ns 27360 ns 25335
Multiplication<32>/1000 33074 ns 33074 ns 20875
Multiplication<200>/160 61364 ns 61362 ns 11542
MultiplicationLeftScalar<3>/1000 3104 ns 3104 ns 223720
MultiplicationLeftScalar<10>/1000 4549 ns 4549 ns 154366
MultiplicationLeftScalar<15>/1000 5921 ns 5921 ns 119294
MultiplicationLeftScalar<25>/1000 11429 ns 11428 ns 61685
MultiplicationLeftScalar<32>/1000 14094 ns 14094 ns 49941
MultiplicationLeftScalar<200>/160 28186 ns 28185 ns 24484
MultiplicationRightScalar<3>/1000 3110 ns 3110 ns 223333
MultiplicationRightScalar<10>/1000 4655 ns 4655 ns 150534
MultiplicationRightScalar<15>/1000 5890 ns 5890 ns 119746
MultiplicationRightScalar<25>/1000 11464 ns 11464 ns 61483
MultiplicationRightScalar<32>/1000 14243 ns 14242 ns 49492
MultiplicationRightScalar<200>/160 28282 ns 28281 ns 24604
Division<3>/1000 9128 ns 9128 ns 77846
Division<10>/1000 14811 ns 14811 ns 47682
Division<15>/1000 23293 ns 23292 ns 30091
Division<25>/1000 37313 ns 37313 ns 18608
Division<32>/1000 41229 ns 41229 ns 16982
Division<200>/160 44802 ns 44802 ns 15573
DivisionLeftScalar<3>/1000 6720 ns 6720 ns 104747
DivisionLeftScalar<10>/1000 9403 ns 9402 ns 75216
DivisionLeftScalar<15>/1000 12313 ns 12313 ns 57366
DivisionLeftScalar<25>/1000 22739 ns 22739 ns 30421
DivisionLeftScalar<32>/1000 20321 ns 20321 ns 34191
DivisionLeftScalar<200>/160 29018 ns 29017 ns 23908
DivisionRightScalar<3>/1000 3815 ns 3815 ns 182333
DivisionRightScalar<10>/1000 5750 ns 5750 ns 121691
DivisionRightScalar<15>/1000 7574 ns 7574 ns 92994
DivisionRightScalar<25>/1000 13953 ns 13953 ns 49250
DivisionRightScalar<32>/1000 16892 ns 16892 ns 41668
DivisionRightScalar<200>/160 28663 ns 28662 ns 24226
MultiplyAndAdd<3>/1000 4399 ns 4399 ns 158635
MultiplyAndAdd<10>/1000 10453 ns 10453 ns 68112
MultiplyAndAdd<15>/1000 11830 ns 11830 ns 59598
MultiplyAndAdd<25>/1000 19624 ns 19624 ns 36240
MultiplyAndAdd<32>/1000 25539 ns 25538 ns 29066
MultiplyAndAdd<200>/160 65362 ns 65358 ns 11086
Change-Id: Ie62492b3fd19ff9d3394f90bd00f0aa01522fc2a
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 140b250..80f8bdc 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -544,6 +544,9 @@
add_executable(schur_eliminator_benchmark schur_eliminator_benchmark.cc)
add_dependencies_to_benchmark(schur_eliminator_benchmark)
+ add_executable(jet_operator_benchmark jet_operator_benchmark.cc)
+ add_dependencies_to_benchmark(jet_operator_benchmark)
+
add_subdirectory(autodiff_benchmarks)
endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/jet_operator_benchmark.cc b/internal/ceres/jet_operator_benchmark.cc
new file mode 100644
index 0000000..5701556
--- /dev/null
+++ b/internal/ceres/jet_operator_benchmark.cc
@@ -0,0 +1,289 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2021 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: alex@karatarakis.com (Alexander Karatarakis)
+
+#include <array>
+
+#include "benchmark/benchmark.h"
+#include "ceres/jet.h"
+
+namespace ceres {
+
+// Pool of SIZE operand sets (five Jets and five scalars per set). advance()
+// steps to the next set so as to avoid caching effects in the benchmark.
+template <class JetType>
+class JetInputData {
+  using T = typename JetType::Scalar;
+  static constexpr std::size_t SIZE = 20;
+
+ public:
+  // `a` members and scalars scale with (slot + 1); `v` uses setRandom().
+  JetInputData() : index_{0}, a_{}, b_{}, c_{}, d_{}, e_{} {
+    for (int slot = 0; slot < static_cast<int>(SIZE); ++slot) {
+      const T factor = static_cast<T>(slot + 1);
+      const auto fill = [factor](JetType& jet, const T base) {
+        jet.a = base * factor;
+        jet.v.setRandom();
+      };
+      // Keep this order: it fixes the sequence of setRandom() calls.
+      fill(a_[slot], T(1.1));
+      fill(b_[slot], T(2.2));
+      fill(c_[slot], T(3.3));
+      fill(d_[slot], T(4.4));
+      fill(e_[slot], T(5.5));
+
+      scalar_a_[slot] = T(1.1) * factor;
+      scalar_b_[slot] = T(2.2) * factor;
+      scalar_c_[slot] = T(3.3) * factor;
+      scalar_d_[slot] = T(4.4) * factor;
+      scalar_e_[slot] = T(5.5) * factor;
+    }
+  }
+
+  // Moves to the next operand set, wrapping around after the last one.
+  void advance() { index_ = (index_ + 1) % SIZE; }
+
+  // Jet operands of the current set.
+  const JetType& a() const { return a_[index_]; }
+  const JetType& b() const { return b_[index_]; }
+  const JetType& c() const { return c_[index_]; }
+  const JetType& d() const { return d_[index_]; }
+  const JetType& e() const { return e_[index_]; }
+  // Scalar operands of the current set.
+  T scalar_a() const { return scalar_a_[index_]; }
+  T scalar_b() const { return scalar_b_[index_]; }
+  T scalar_c() const { return scalar_c_[index_]; }
+  T scalar_d() const { return scalar_d_[index_]; }
+  T scalar_e() const { return scalar_e_[index_]; }
+
+ private:
+  std::size_t index_;
+  std::array<JetType, SIZE> a_;
+  std::array<JetType, SIZE> b_;
+  std::array<JetType, SIZE> c_;
+  std::array<JetType, SIZE> d_;
+  std::array<JetType, SIZE> e_;
+  std::array<T, SIZE> scalar_a_;
+  std::array<T, SIZE> scalar_b_;
+  std::array<T, SIZE> scalar_c_;
+  std::array<T, SIZE> scalar_d_;
+  std::array<T, SIZE> scalar_e_;
+};
+
+// Per timed iteration: state.range(0) calls of func, cycling inputs each call.
+template <std::size_t JET_SIZE, class Function>
+static void JetBenchmarkHelper(benchmark::State& state, const Function& func) {
+  JetInputData<Jet<double, JET_SIZE>> inputs{};
+  Jet<double, JET_SIZE> result{};
+  const int calls_per_run = static_cast<int>(state.range(0));
+  for (auto _ : state) {
+    for (int call = 0; call < calls_per_run; ++call) {
+      func(inputs, result);
+      inputs.advance();
+    }
+  }
+  benchmark::DoNotOptimize(result);  // keep the accumulated result live
+}
+
+// Benchmarks unary plus and Jet + Jet: accumulates the sum of five Jets.
+template <std::size_t JET_SIZE>
+static void Addition(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc += +in.a() + in.b() + in.c() + in.d() + in.e();
+  });
+}
+BENCHMARK_TEMPLATE(Addition, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(Addition, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(Addition, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(Addition, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(Addition, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(Addition, 200)->Arg(160);
+
+// Benchmarks additions mixing scalars with one Jet, accumulated via +=.
+template <std::size_t JET_SIZE>
+static void AdditionScalar(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc +=
+        in.scalar_a() + in.scalar_b() + in.c() + in.scalar_d() + in.scalar_e();
+  });
+}
+BENCHMARK_TEMPLATE(AdditionScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(AdditionScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(AdditionScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(AdditionScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(AdditionScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(AdditionScalar, 200)->Arg(160);
+
+// Benchmarks unary minus and Jet - Jet over five Jets, accumulated via -=.
+template <std::size_t JET_SIZE>
+static void Subtraction(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc -= -in.a() - in.b() - in.c() - in.d() - in.e();
+  });
+}
+BENCHMARK_TEMPLATE(Subtraction, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(Subtraction, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(Subtraction, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(Subtraction, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(Subtraction, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(Subtraction, 200)->Arg(160);
+
+// Benchmarks subtractions mixing scalars with one Jet, accumulated via -=.
+template <std::size_t JET_SIZE>
+static void SubtractionScalar(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc -=
+        -in.scalar_a() - in.scalar_b() - in.c() - in.scalar_d() - in.scalar_e();
+  });
+}
+BENCHMARK_TEMPLATE(SubtractionScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(SubtractionScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(SubtractionScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(SubtractionScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(SubtractionScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(SubtractionScalar, 200)->Arg(160);
+
+// Benchmarks Jet * Jet: the product of five Jets is folded in via *=.
+template <std::size_t JET_SIZE>
+static void Multiplication(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc *= in.a() * in.b() * in.c() * in.d() * in.e();
+  });
+}
+BENCHMARK_TEMPLATE(Multiplication, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(Multiplication, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(Multiplication, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(Multiplication, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(Multiplication, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(Multiplication, 200)->Arg(160);
+
+// Benchmarks scalar * Jet: four nested products with the scalar on the left.
+template <std::size_t JET_SIZE>
+static void MultiplicationLeftScalar(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc += in.scalar_a() *
+           (in.scalar_b() * (in.scalar_c() * (in.scalar_d() * in.e())));
+  });
+}
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 200)->Arg(160);
+
+// Benchmarks Jet * scalar: four chained products with the scalar on the right.
+template <std::size_t JET_SIZE>
+static void MultiplicationRightScalar(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc += (((in.a() * in.scalar_b()) * in.scalar_c()) * in.scalar_d()) *
+           in.scalar_e();
+  });
+}
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplicationRightScalar, 200)->Arg(160);
+
+// Benchmarks Jet / Jet: a chain of four divisions is folded in via /=.
+template <std::size_t JET_SIZE>
+static void Division(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc /= in.a() / in.b() / in.c() / in.d() / in.e();
+  });
+}
+BENCHMARK_TEMPLATE(Division, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(Division, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(Division, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(Division, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(Division, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(Division, 200)->Arg(160);
+
+// Benchmarks scalar / Jet: four nested divisions with the scalar on the left.
+template <std::size_t JET_SIZE>
+static void DivisionLeftScalar(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc += in.scalar_a() /
+           (in.scalar_b() / (in.scalar_c() / (in.scalar_d() / in.e())));
+  });
+}
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionLeftScalar, 200)->Arg(160);
+
+// Benchmarks Jet / scalar: four chained divisions with the scalar on the right.
+template <std::size_t JET_SIZE>
+static void DivisionRightScalar(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc += (((in.a() / in.scalar_b()) / in.scalar_c()) / in.scalar_d()) /
+           in.scalar_e();
+  });
+}
+BENCHMARK_TEMPLATE(DivisionRightScalar, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionRightScalar, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionRightScalar, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionRightScalar, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionRightScalar, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(DivisionRightScalar, 200)->Arg(160);
+
+// Benchmarks a sum of five scalar * Jet products, accumulated via +=.
+template <std::size_t JET_SIZE>
+static void MultiplyAndAdd(benchmark::State& state) {
+  using J = Jet<double, JET_SIZE>;
+  JetBenchmarkHelper<JET_SIZE>(state, [](const JetInputData<J>& in, J& acc) {
+    acc += in.scalar_a() * in.a() + in.scalar_b() * in.b() +
+           in.scalar_c() * in.c() + in.scalar_d() * in.d() +
+           in.scalar_e() * in.e();
+  });
+}
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 3)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 10)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 15)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 25)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 32)->Arg(1000);
+BENCHMARK_TEMPLATE(MultiplyAndAdd, 200)->Arg(160);
+
+} // namespace ceres
+
+BENCHMARK_MAIN();