Introduce benchmark for Jet operations Run on (20 X 4300 MHz CPU s) CPU Caches: L1 Data 32 KiB (x10) L1 Instruction 32 KiB (x10) L2 Unified 1024 KiB (x10) L3 Unified 14080 KiB (x1) Load Average: 2.37, 3.00, 3.08 ----------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------- Addition<3>/1000 2872 ns 2872 ns 240449 Addition<10>/1000 5304 ns 5304 ns 100000 Addition<15>/1000 8211 ns 8210 ns 78742 Addition<25>/1000 14214 ns 14213 ns 46762 Addition<32>/1000 13746 ns 13746 ns 50892 Addition<200>/160 41228 ns 41228 ns 17183 AdditionScalar<3>/1000 2514 ns 2514 ns 273996 AdditionScalar<10>/1000 2733 ns 2733 ns 255508 AdditionScalar<15>/1000 2622 ns 2622 ns 264291 AdditionScalar<25>/1000 3753 ns 3753 ns 183508 AdditionScalar<32>/1000 4254 ns 4254 ns 167016 AdditionScalar<200>/160 18314 ns 18314 ns 38116 Subtraction<3>/1000 3241 ns 3241 ns 206370 Subtraction<10>/1000 5023 ns 5023 ns 139271 Subtraction<15>/1000 8387 ns 8386 ns 89927 Subtraction<25>/1000 14951 ns 14950 ns 48756 Subtraction<32>/1000 14587 ns 14587 ns 47056 Subtraction<200>/160 47175 ns 47175 ns 15574 SubtractionScalar<3>/1000 2572 ns 2572 ns 264468 SubtractionScalar<10>/1000 2713 ns 2713 ns 257920 SubtractionScalar<15>/1000 2621 ns 2621 ns 265289 SubtractionScalar<25>/1000 3593 ns 3593 ns 192266 SubtractionScalar<32>/1000 4255 ns 4255 ns 163738 SubtractionScalar<200>/160 19906 ns 19906 ns 35295 Multiplication<3>/1000 6058 ns 6058 ns 114067 Multiplication<10>/1000 11999 ns 11999 ns 58492 Multiplication<15>/1000 17906 ns 17905 ns 39565 Multiplication<25>/1000 27361 ns 27360 ns 25335 Multiplication<32>/1000 33074 ns 33074 ns 20875 Multiplication<200>/160 61364 ns 61362 ns 11542 MultiplicationLeftScalar<3>/1000 3104 ns 3104 ns 223720 MultiplicationLeftScalar<10>/1000 4549 ns 4549 ns 154366 MultiplicationLeftScalar<15>/1000 5921 ns 5921 ns 119294 MultiplicationLeftScalar<25>/1000 11429 ns 11428 ns 61685 MultiplicationLeftScalar<32>/1000 14094 ns 14094 ns 49941 MultiplicationLeftScalar<200>/160 28186 ns 28185 ns 24484 MultiplicationRightScalar<3>/1000 3110 ns 3110 ns 223333 MultiplicationRightScalar<10>/1000 4655 ns 4655 ns 150534 MultiplicationRightScalar<15>/1000 5890 ns 5890 ns 119746 MultiplicationRightScalar<25>/1000 11464 ns 11464 ns 61483 MultiplicationRightScalar<32>/1000 14243 ns 14242 ns 49492 MultiplicationRightScalar<200>/160 28282 ns 28281 ns 24604 Division<3>/1000 9128 ns 9128 ns 77846 Division<10>/1000 14811 ns 14811 ns 47682 Division<15>/1000 23293 ns 23292 ns 30091 Division<25>/1000 37313 ns 37313 ns 18608 Division<32>/1000 41229 ns 41229 ns 16982 Division<200>/160 44802 ns 44802 ns 15573 DivisionLeftScalar<3>/1000 6720 ns 6720 ns 104747 DivisionLeftScalar<10>/1000 9403 ns 9402 ns 75216 DivisionLeftScalar<15>/1000 12313 ns 12313 ns 57366 DivisionLeftScalar<25>/1000 22739 ns 22739 ns 30421 DivisionLeftScalar<32>/1000 20321 ns 20321 ns 34191 DivisionLeftScalar<200>/160 29018 ns 29017 ns 23908 DivisionRightScalar<3>/1000 3815 ns 3815 ns 182333 DivisionRightScalar<10>/1000 5750 ns 5750 ns 121691 DivisionRightScalar<15>/1000 7574 ns 7574 ns 92994 DivisionRightScalar<25>/1000 13953 ns 13953 ns 49250 DivisionRightScalar<32>/1000 16892 ns 16892 ns 41668 DivisionRightScalar<200>/160 28663 ns 28662 ns 24226 MultiplyAndAdd<3>/1000 4399 ns 4399 ns 158635 MultiplyAndAdd<10>/1000 10453 ns 10453 ns 68112 MultiplyAndAdd<15>/1000 11830 ns 11830 ns 59598 MultiplyAndAdd<25>/1000 19624 ns 19624 ns 36240 MultiplyAndAdd<32>/1000 25539 ns 25538 ns 29066 MultiplyAndAdd<200>/160 65362 ns 65358 ns 11086 Change-Id: Ie62492b3fd19ff9d3394f90bd00f0aa01522fc2a
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index 140b250..80f8bdc 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -544,6 +544,9 @@ add_executable(schur_eliminator_benchmark schur_eliminator_benchmark.cc) add_dependencies_to_benchmark(schur_eliminator_benchmark) + add_executable(jet_operator_benchmark jet_operator_benchmark.cc) + add_dependencies_to_benchmark(jet_operator_benchmark) + add_subdirectory(autodiff_benchmarks) endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/jet_operator_benchmark.cc b/internal/ceres/jet_operator_benchmark.cc new file mode 100644 index 0000000..5701556 --- /dev/null +++ b/internal/ceres/jet_operator_benchmark.cc
@@ -0,0 +1,289 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2021 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: alex@karatarakis.com (Alexander Karatarakis) + +#include <array> + +#include "benchmark/benchmark.h" +#include "ceres/jet.h" + +namespace ceres { + +// Cycle the Jets to avoid caching effects in the benchmark. +template <class JetType> +class JetInputData { + using T = typename JetType::Scalar; + static constexpr std::size_t SIZE = 20; + + public: + JetInputData() : index_{0}, a_{}, b_{}, c_{}, d_{}, e_{} { + for (int i = 0; i < static_cast<int>(SIZE); i++) { + const T ti = static_cast<T>(i + 1); + + a_[i].a = T(1.1) * ti; + a_[i].v.setRandom(); + + b_[i].a = T(2.2) * ti; + b_[i].v.setRandom(); + + c_[i].a = T(3.3) * ti; + c_[i].v.setRandom(); + + d_[i].a = T(4.4) * ti; + d_[i].v.setRandom(); + + e_[i].a = T(5.5) * ti; + e_[i].v.setRandom(); + + scalar_a_[i] = T(1.1) * ti; + scalar_b_[i] = T(2.2) * ti; + scalar_c_[i] = T(3.3) * ti; + scalar_d_[i] = T(4.4) * ti; + scalar_e_[i] = T(5.5) * ti; + } + } + + void advance() { index_ = (index_ + 1) % SIZE; } + + const JetType& a() const { return a_[index_]; } + const JetType& b() const { return b_[index_]; } + const JetType& c() const { return c_[index_]; } + const JetType& d() const { return d_[index_]; } + const JetType& e() const { return e_[index_]; } + T scalar_a() const { return scalar_a_[index_]; } + T scalar_b() const { return scalar_b_[index_]; } + T scalar_c() const { return scalar_c_[index_]; } + T scalar_d() const { return scalar_d_[index_]; } + T scalar_e() const { return scalar_e_[index_]; } + + private: + std::size_t index_; + std::array<JetType, SIZE> a_; + std::array<JetType, SIZE> b_; + std::array<JetType, SIZE> c_; + std::array<JetType, SIZE> d_; + std::array<JetType, SIZE> e_; + std::array<T, SIZE> scalar_a_; + std::array<T, SIZE> scalar_b_; + std::array<T, SIZE> scalar_c_; + std::array<T, SIZE> scalar_d_; + std::array<T, SIZE> scalar_e_; +}; + +template <std::size_t JET_SIZE, class Function> +static void JetBenchmarkHelper(benchmark::State& state, const Function& func) { + using JetType = Jet<double, JET_SIZE>; + JetInputData<JetType> data{}; + JetType out{}; + const int iterations = static_cast<int>(state.range(0)); + for (auto _ : state) { + for (int i = 0; i < iterations; i++) { + func(data, out); + data.advance(); + } + } + benchmark::DoNotOptimize(out); +} + +template <std::size_t JET_SIZE> +static void Addition(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out += +d.a() + d.b() + d.c() + d.d() + d.e(); + }); +} +BENCHMARK_TEMPLATE(Addition, 3)->Arg(1000); +BENCHMARK_TEMPLATE(Addition, 10)->Arg(1000); +BENCHMARK_TEMPLATE(Addition, 15)->Arg(1000); +BENCHMARK_TEMPLATE(Addition, 25)->Arg(1000); +BENCHMARK_TEMPLATE(Addition, 32)->Arg(1000); +BENCHMARK_TEMPLATE(Addition, 200)->Arg(160); + +template <std::size_t JET_SIZE> +static void AdditionScalar(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out += + d.scalar_a() + d.scalar_b() + d.c() + d.scalar_d() + d.scalar_e(); + }); +} +BENCHMARK_TEMPLATE(AdditionScalar, 3)->Arg(1000); +BENCHMARK_TEMPLATE(AdditionScalar, 10)->Arg(1000); +BENCHMARK_TEMPLATE(AdditionScalar, 15)->Arg(1000); +BENCHMARK_TEMPLATE(AdditionScalar, 25)->Arg(1000); +BENCHMARK_TEMPLATE(AdditionScalar, 32)->Arg(1000); +BENCHMARK_TEMPLATE(AdditionScalar, 200)->Arg(160); + +template <std::size_t JET_SIZE> +static void Subtraction(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out -= -d.a() - d.b() - d.c() - d.d() - d.e(); + }); +} +BENCHMARK_TEMPLATE(Subtraction, 3)->Arg(1000); +BENCHMARK_TEMPLATE(Subtraction, 10)->Arg(1000); +BENCHMARK_TEMPLATE(Subtraction, 15)->Arg(1000); +BENCHMARK_TEMPLATE(Subtraction, 25)->Arg(1000); +BENCHMARK_TEMPLATE(Subtraction, 32)->Arg(1000); +BENCHMARK_TEMPLATE(Subtraction, 200)->Arg(160); + +template <std::size_t JET_SIZE> +static void SubtractionScalar(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out -= + -d.scalar_a() - d.scalar_b() - d.c() - d.scalar_d() - d.scalar_e(); + }); +} +BENCHMARK_TEMPLATE(SubtractionScalar, 3)->Arg(1000); +BENCHMARK_TEMPLATE(SubtractionScalar, 10)->Arg(1000); +BENCHMARK_TEMPLATE(SubtractionScalar, 15)->Arg(1000); +BENCHMARK_TEMPLATE(SubtractionScalar, 25)->Arg(1000); +BENCHMARK_TEMPLATE(SubtractionScalar, 32)->Arg(1000); +BENCHMARK_TEMPLATE(SubtractionScalar, 200)->Arg(160); + +template <std::size_t JET_SIZE> +static void Multiplication(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out *= d.a() * d.b() * d.c() * d.d() * d.e(); + }); +} +BENCHMARK_TEMPLATE(Multiplication, 3)->Arg(1000); +BENCHMARK_TEMPLATE(Multiplication, 10)->Arg(1000); +BENCHMARK_TEMPLATE(Multiplication, 15)->Arg(1000); +BENCHMARK_TEMPLATE(Multiplication, 25)->Arg(1000); +BENCHMARK_TEMPLATE(Multiplication, 32)->Arg(1000); +BENCHMARK_TEMPLATE(Multiplication, 200)->Arg(160); + +template <std::size_t JET_SIZE> +static void MultiplicationLeftScalar(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out += d.scalar_a() * + (d.scalar_b() * (d.scalar_c() * (d.scalar_d() * d.e()))); + }); +} +BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 3)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 10)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 15)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 25)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 32)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplicationLeftScalar, 200)->Arg(160); + +template <std::size_t JET_SIZE> +static void MultiplicationRightScalar(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out += (((d.a() * d.scalar_b()) * d.scalar_c()) * d.scalar_d()) * + d.scalar_e(); + }); +} +BENCHMARK_TEMPLATE(MultiplicationRightScalar, 3)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplicationRightScalar, 10)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplicationRightScalar, 15)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplicationRightScalar, 25)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplicationRightScalar, 32)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplicationRightScalar, 200)->Arg(160); + +template <std::size_t JET_SIZE> +static void Division(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out /= d.a() / d.b() / d.c() / d.d() / d.e(); + }); +} +BENCHMARK_TEMPLATE(Division, 3)->Arg(1000); +BENCHMARK_TEMPLATE(Division, 10)->Arg(1000); +BENCHMARK_TEMPLATE(Division, 15)->Arg(1000); +BENCHMARK_TEMPLATE(Division, 25)->Arg(1000); +BENCHMARK_TEMPLATE(Division, 32)->Arg(1000); +BENCHMARK_TEMPLATE(Division, 200)->Arg(160); + +template <std::size_t JET_SIZE> +static void DivisionLeftScalar(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out += d.scalar_a() / + (d.scalar_b() / (d.scalar_c() / (d.scalar_d() / d.e()))); + }); +} +BENCHMARK_TEMPLATE(DivisionLeftScalar, 3)->Arg(1000); +BENCHMARK_TEMPLATE(DivisionLeftScalar, 10)->Arg(1000); +BENCHMARK_TEMPLATE(DivisionLeftScalar, 15)->Arg(1000); +BENCHMARK_TEMPLATE(DivisionLeftScalar, 25)->Arg(1000); +BENCHMARK_TEMPLATE(DivisionLeftScalar, 32)->Arg(1000); +BENCHMARK_TEMPLATE(DivisionLeftScalar, 200)->Arg(160); + +template <std::size_t JET_SIZE> +static void DivisionRightScalar(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out += (((d.a() / d.scalar_b()) / d.scalar_c()) / d.scalar_d()) / + d.scalar_e(); + }); +} +BENCHMARK_TEMPLATE(DivisionRightScalar, 3)->Arg(1000); +BENCHMARK_TEMPLATE(DivisionRightScalar, 10)->Arg(1000); +BENCHMARK_TEMPLATE(DivisionRightScalar, 15)->Arg(1000); +BENCHMARK_TEMPLATE(DivisionRightScalar, 25)->Arg(1000); +BENCHMARK_TEMPLATE(DivisionRightScalar, 32)->Arg(1000); +BENCHMARK_TEMPLATE(DivisionRightScalar, 200)->Arg(160); + +template <std::size_t JET_SIZE> +static void MultiplyAndAdd(benchmark::State& state) { + using JetType = Jet<double, JET_SIZE>; + JetBenchmarkHelper<JET_SIZE>( + state, [](const JetInputData<JetType>& d, JetType& out) { + out += d.scalar_a() * d.a() + d.scalar_b() * d.b() + + d.scalar_c() * d.c() + d.scalar_d() * d.d() + + d.scalar_e() * d.e(); + }); +} +BENCHMARK_TEMPLATE(MultiplyAndAdd, 3)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplyAndAdd, 10)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplyAndAdd, 15)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplyAndAdd, 25)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplyAndAdd, 32)->Arg(1000); +BENCHMARK_TEMPLATE(MultiplyAndAdd, 200)->Arg(160); + +} // namespace ceres + +BENCHMARK_MAIN();