Add a benchmark for small_blas.h This CL adds a benchmark for MatrixVectorMultiply and MatrixTransposeVectorMultiply. Change-Id: I50e1cec72c91dedf081b6f00681874abf545123b
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index c5fb259..884fd57 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -377,4 +377,8 @@ if (BUILD_BENCHMARKS) add_executable(autodiff_cost_function_benchmark autodiff_cost_function_benchmark.cc) target_link_libraries(autodiff_cost_function_benchmark ceres benchmark::benchmark) + + add_executable(small_blas_benchmark small_blas_benchmark.cc) + target_link_libraries(small_blas_benchmark ceres benchmark::benchmark) + endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/small_blas_benchmark.cc b/internal/ceres/small_blas_benchmark.cc new file mode 100644 index 0000000..c78a69d --- /dev/null +++ b/internal/ceres/small_blas_benchmark.cc
@@ -0,0 +1,296 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2018 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
// Benchmarking matrix-vector multiply routines and optimizing memory
// access requires that the operands are not just sitting in the
// cache. So, as the benchmarking routine iterates, it needs to
// multiply new/different matrices and vectors. Allocating/creating
// these objects inside the benchmarking loop is too heavy duty, so
// they are created beforehand and the benchmark cycles through them.
//
// Given the size of the matrix, this class creates num_elements()
// triples (A, b, c) for the expression c = A * b, where A has
// rows * cols entries, b has cols entries and c has rows entries.
// All triples are packed back to back in one buffer whose entries
// are initialized to 1.00000000000001.
class MatrixVectorMultiplyData {
 public:
  // rows and cols are expected to be non-negative.
  MatrixVectorMultiplyData(int rows, int cols) : num_elements_(1000) {
    const size_t a_size = static_cast<size_t>(rows) * static_cast<size_t>(cols);
    const size_t b_size = static_cast<size_t>(cols);
    const size_t c_size = static_cast<size_t>(rows);

    // A single memory buffer for all the matrices & vectors. It is
    // sized from the actual operand sizes so that every pointer handed
    // out below stays inside the allocation for any matrix shape.
    const size_t element_size = a_size + b_size + c_size;
    data_.resize(num_elements_ * element_size, 1.00000000000001);

    // Each element is three pointers, corresponding to the three
    // operands of the expression c = A * b.
    ptrs_.resize(3 * num_elements_, nullptr);
    double* p = data_.data();
    for (int i = 0; i < num_elements_; ++i) {
      // Matrix A.
      ptrs_[3 * i] = p;
      p += a_size;
      // Vector b.
      ptrs_[3 * i + 1] = p;
      p += b_size;
      // Vector c.
      ptrs_[3 * i + 2] = p;
      p += c_size;
    }
  }

  // Number of (A, b, c) triples stored in the buffer.
  int num_elements() const { return num_elements_; }

  // Start of the shared buffer; equal to ptrs()[0].
  double* data() { return data_.data(); }

  // For element i, entries 3 * i, 3 * i + 1 and 3 * i + 2 point to A, b
  // and c respectively.
  const std::vector<double*>& ptrs() const { return ptrs_; }

 private:
  int num_elements_;
  std::vector<double> data_;
  std::vector<double*> ptrs_;
};
BM_MatrixVectorMultiplyDynamic/3/6 15 ns 15 ns 44712435 +// BM_MatrixVectorMultiplyDynamic/3/8 18 ns 18 ns 35177294 +// BM_MatrixVectorMultiplyDynamic/3/10 21 ns 21 ns 32164683 +// BM_MatrixVectorMultiplyDynamic/3/12 24 ns 24 ns 28222279 +// BM_MatrixVectorMultiplyDynamic/3/16 30 ns 30 ns 23050731 +// BM_MatrixVectorMultiplyDynamic/3/20 38 ns 38 ns 17832714 +// BM_MatrixVectorMultiplyDynamic/4/1 8 ns 8 ns 85763293 +// BM_MatrixVectorMultiplyDynamic/4/2 16 ns 16 ns 41959886 +// BM_MatrixVectorMultiplyDynamic/4/3 19 ns 19 ns 36674176 +// BM_MatrixVectorMultiplyDynamic/4/4 15 ns 15 ns 43561867 +// BM_MatrixVectorMultiplyDynamic/4/6 21 ns 21 ns 34278607 +// BM_MatrixVectorMultiplyDynamic/4/8 22 ns 22 ns 31484163 +// BM_MatrixVectorMultiplyDynamic/4/10 26 ns 26 ns 25605197 +// BM_MatrixVectorMultiplyDynamic/4/12 31 ns 31 ns 23380172 +// BM_MatrixVectorMultiplyDynamic/4/16 38 ns 38 ns 18054638 +// BM_MatrixVectorMultiplyDynamic/4/20 49 ns 49 ns 14771703 +void BM_MatrixVectorMultiplyDynamic(benchmark::State& state) { + const int rows = state.range(0); + const int cols = state.range(1); + MatrixVectorMultiplyData data(rows, cols); + const std::vector<double*> ptrs = data.ptrs(); + const int num_elements = data.num_elements(); + + int i = 0; + for (auto _ : state) { + double* a_ptr = ptrs[3 * i]; + double* b_ptr = ptrs[3 * i + 1]; + double* c_ptr = ptrs[3 * i + 2]; + internal::MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>( + a_ptr, rows, cols, b_ptr, c_ptr); + i = (i + 1) % num_elements; + } +} + +// Each ArgPair specifies a row and column size of the matrix. 
+BENCHMARK(BM_MatrixVectorMultiplyDynamic) +->ArgPair(1, 1) +->ArgPair(1, 2) +->ArgPair(1, 3) +->ArgPair(1, 4) +->ArgPair(1, 6) +->ArgPair(1, 8) +->ArgPair(1, 10) +->ArgPair(1, 12) +->ArgPair(1, 16) +->ArgPair(1, 20) +->ArgPair(2, 1) +->ArgPair(2, 2) +->ArgPair(2, 3) +->ArgPair(2, 4) +->ArgPair(2, 6) +->ArgPair(2, 8) +->ArgPair(2, 10) +->ArgPair(2, 12) +->ArgPair(2, 16) +->ArgPair(2, 20) +->ArgPair(3, 1) +->ArgPair(3, 2) +->ArgPair(3, 3) +->ArgPair(3, 4) +->ArgPair(3, 6) +->ArgPair(3, 8) +->ArgPair(3, 10) +->ArgPair(3, 12) +->ArgPair(3, 16) +->ArgPair(3, 20) +->ArgPair(4, 1) +->ArgPair(4, 2) +->ArgPair(4, 3) +->ArgPair(4, 4) +->ArgPair(4, 6) +->ArgPair(4, 8) +->ArgPair(4, 10) +->ArgPair(4, 12) +->ArgPair(4, 16) +->ArgPair(4, 20); + +// Run on (8 X 2200 MHz CPU s) +// 2018-02-06 21:18:17 +// ------------------------------------------------------------------------------------ +// Benchmark Time CPU Iterations +// ------------------------------------------------------------------------------------ +// BM_MatrixTransposeVectorMultiplyDynamic/1/1 5 ns 5 ns 139356174 +// BM_MatrixTransposeVectorMultiplyDynamic/1/2 6 ns 6 ns 120800041 +// BM_MatrixTransposeVectorMultiplyDynamic/1/3 7 ns 7 ns 100267858 +// BM_MatrixTransposeVectorMultiplyDynamic/1/4 9 ns 9 ns 70778564 +// BM_MatrixTransposeVectorMultiplyDynamic/1/6 14 ns 14 ns 47748651 +// BM_MatrixTransposeVectorMultiplyDynamic/1/8 16 ns 16 ns 43903663 +// BM_MatrixTransposeVectorMultiplyDynamic/1/10 18 ns 18 ns 34838177 +// BM_MatrixTransposeVectorMultiplyDynamic/1/12 20 ns 20 ns 36138731 +// BM_MatrixTransposeVectorMultiplyDynamic/1/16 23 ns 23 ns 27063704 +// BM_MatrixTransposeVectorMultiplyDynamic/1/20 29 ns 29 ns 23400336 +// BM_MatrixTransposeVectorMultiplyDynamic/2/1 6 ns 6 ns 121572101 +// BM_MatrixTransposeVectorMultiplyDynamic/2/2 8 ns 8 ns 82896155 +// BM_MatrixTransposeVectorMultiplyDynamic/2/3 12 ns 12 ns 56705415 +// BM_MatrixTransposeVectorMultiplyDynamic/2/4 14 ns 14 ns 51241509 +// 
BM_MatrixTransposeVectorMultiplyDynamic/2/6 18 ns 18 ns 38377403 +// BM_MatrixTransposeVectorMultiplyDynamic/2/8 25 ns 25 ns 28560121 +// BM_MatrixTransposeVectorMultiplyDynamic/2/10 29 ns 29 ns 23608052 +// BM_MatrixTransposeVectorMultiplyDynamic/2/12 33 ns 33 ns 20668478 +// BM_MatrixTransposeVectorMultiplyDynamic/2/16 44 ns 44 ns 16335446 +// BM_MatrixTransposeVectorMultiplyDynamic/2/20 53 ns 53 ns 13462315 +// BM_MatrixTransposeVectorMultiplyDynamic/3/1 6 ns 6 ns 117031415 +// BM_MatrixTransposeVectorMultiplyDynamic/3/2 10 ns 10 ns 71040747 +// BM_MatrixTransposeVectorMultiplyDynamic/3/3 14 ns 14 ns 49453538 +// BM_MatrixTransposeVectorMultiplyDynamic/3/4 17 ns 17 ns 39161935 +// BM_MatrixTransposeVectorMultiplyDynamic/3/6 22 ns 22 ns 32118490 +// BM_MatrixTransposeVectorMultiplyDynamic/3/8 28 ns 28 ns 25295689 +// BM_MatrixTransposeVectorMultiplyDynamic/3/10 34 ns 34 ns 20900389 +// BM_MatrixTransposeVectorMultiplyDynamic/3/12 39 ns 39 ns 17934922 +// BM_MatrixTransposeVectorMultiplyDynamic/3/16 51 ns 51 ns 10000000 +// BM_MatrixTransposeVectorMultiplyDynamic/3/20 64 ns 64 ns 10594824 +// BM_MatrixTransposeVectorMultiplyDynamic/4/1 7 ns 7 ns 98903583 +// BM_MatrixTransposeVectorMultiplyDynamic/4/2 13 ns 13 ns 57301899 +// BM_MatrixTransposeVectorMultiplyDynamic/4/3 16 ns 16 ns 44622083 +// BM_MatrixTransposeVectorMultiplyDynamic/4/4 18 ns 18 ns 39645007 +// BM_MatrixTransposeVectorMultiplyDynamic/4/6 26 ns 26 ns 27239262 +// BM_MatrixTransposeVectorMultiplyDynamic/4/8 33 ns 33 ns 20869171 +// BM_MatrixTransposeVectorMultiplyDynamic/4/10 39 ns 39 ns 17169614 +// BM_MatrixTransposeVectorMultiplyDynamic/4/12 47 ns 47 ns 15045286 +// BM_MatrixTransposeVectorMultiplyDynamic/4/16 62 ns 62 ns 11437535 +// BM_MatrixTransposeVectorMultiplyDynamic/4/20 77 ns 77 ns 8351428 +void BM_MatrixTransposeVectorMultiplyDynamic(benchmark::State& state) { + const int rows = state.range(0); + const int cols = state.range(1); + MatrixVectorMultiplyData data(rows, cols); + const 
std::vector<double*> ptrs = data.ptrs(); + const int num_elements = data.num_elements(); + + int i = 0; + for (auto _ : state) { + double* a_ptr = ptrs[3 * i]; + double* b_ptr = ptrs[3 * i + 1]; + double* c_ptr = ptrs[3 * i + 2]; + internal::MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>( + a_ptr, rows, cols, c_ptr, b_ptr); + i = (i + 1) % num_elements; + } +} + +// Each ArgPair specifies a row and column size of the matrix. +BENCHMARK(BM_MatrixTransposeVectorMultiplyDynamic) +->ArgPair(1, 1) +->ArgPair(1, 2) +->ArgPair(1, 3) +->ArgPair(1, 4) +->ArgPair(1, 6) +->ArgPair(1, 8) +->ArgPair(1, 10) +->ArgPair(1, 12) +->ArgPair(1, 16) +->ArgPair(1, 20) +->ArgPair(2, 1) +->ArgPair(2, 2) +->ArgPair(2, 3) +->ArgPair(2, 4) +->ArgPair(2, 6) +->ArgPair(2, 8) +->ArgPair(2, 10) +->ArgPair(2, 12) +->ArgPair(2, 16) +->ArgPair(2, 20) +->ArgPair(3, 1) +->ArgPair(3, 2) +->ArgPair(3, 3) +->ArgPair(3, 4) +->ArgPair(3, 6) +->ArgPair(3, 8) +->ArgPair(3, 10) +->ArgPair(3, 12) +->ArgPair(3, 16) +->ArgPair(3, 20) +->ArgPair(4, 1) +->ArgPair(4, 2) +->ArgPair(4, 3) +->ArgPair(4, 4) +->ArgPair(4, 6) +->ArgPair(4, 8) +->ArgPair(4, 10) +->ArgPair(4, 12) +->ArgPair(4, 16) +->ArgPair(4, 20); + +} // namespace ceres + +BENCHMARK_MAIN();