Add a benchmark for small_blas.h
This CL adds a benchmark for MatrixVectorMultiply and
MatrixTransposeVectorMultiply.
Change-Id: I50e1cec72c91dedf081b6f00681874abf545123b
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index c5fb259..884fd57 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -377,4 +377,8 @@
if (BUILD_BENCHMARKS)
add_executable(autodiff_cost_function_benchmark autodiff_cost_function_benchmark.cc)
target_link_libraries(autodiff_cost_function_benchmark ceres benchmark::benchmark)
+
+ add_executable(small_blas_benchmark small_blas_benchmark.cc)
+ target_link_libraries(small_blas_benchmark ceres benchmark::benchmark)
+
endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/small_blas_benchmark.cc b/internal/ceres/small_blas_benchmark.cc
new file mode 100644
index 0000000..c78a69d
--- /dev/null
+++ b/internal/ceres/small_blas_benchmark.cc
@@ -0,0 +1,296 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: sameeragarwal@google.com (Sameer Agarwal)
+
+#include <iostream>
+#include "Eigen/Dense"
+#include "benchmark/benchmark.h"
+#include "ceres/small_blas.h"
+
+namespace ceres {
+
+// Pre-allocated operands for the matrix-vector multiply benchmarks.
+// Timing these routines meaningfully requires that the operands are not
+// simply sitting in the cache, so consecutive iterations must multiply
+// different matrices and vectors. Creating them inside the benchmark loop
+// is too expensive; instead this class builds num_elements() operand
+// triples for a rows x cols matrix up front and the benchmark cycles
+// through them.
+class MatrixVectorMultiplyData {
+ public:
+  // Lays out every (A, b, c) triple of c = A * b back to back in one buffer:
+  // A holds rows * cols values, b holds cols values and c holds rows values.
+  MatrixVectorMultiplyData(int rows, int cols) : num_elements_(1000) {
+    // Size the buffer from the actual dimensions; a fixed per-triple budget
+    // would be overrun once rows * cols + rows + cols exceeded it.
+    const size_t matrix_size = static_cast<size_t>(rows) * cols;
+    const size_t stride = matrix_size + cols + rows;
+    data_.resize(num_elements_ * stride, 1.00000000000001);
+
+    // Three pointers per triple, stored in the order A, b, c.
+    ptrs_.resize(3 * num_elements_, nullptr);
+    double* p = data_.data();
+    for (int i = 0; i < num_elements_; ++i) {
+      ptrs_[3 * i] = p;  // Matrix A.
+      p += matrix_size;
+      ptrs_[3 * i + 1] = p;  // Vector b.
+      p += cols;
+      ptrs_[3 * i + 2] = p;  // Vector c.
+      p += rows;
+    }
+  }
+
+  // Number of (A, b, c) triples available.
+  int num_elements() const { return num_elements_; }
+  // Start of the shared buffer backing every triple.
+  double* data() { return data_.data(); }
+  // Pointer table; triple i is at indices 3 * i, 3 * i + 1 and 3 * i + 2.
+  const std::vector<double*>& ptrs() const { return ptrs_; }
+
+ private:
+  int num_elements_;
+  std::vector<double> data_;
+  std::vector<double*> ptrs_;
+};
+
+// Run on (8 X 2200 MHz CPU s)
+// 2018-02-06 21:23:59
+// ---------------------------------------------------------------------------
+// Benchmark Time CPU Iterations
+// ---------------------------------------------------------------------------
+// BM_MatrixVectorMultiplyDynamic/1/1 4 ns 4 ns 165611093
+// BM_MatrixVectorMultiplyDynamic/1/2 5 ns 5 ns 140648672
+// BM_MatrixVectorMultiplyDynamic/1/3 5 ns 5 ns 139414459
+// BM_MatrixVectorMultiplyDynamic/1/4 5 ns 5 ns 144247512
+// BM_MatrixVectorMultiplyDynamic/1/6 6 ns 6 ns 106639042
+// BM_MatrixVectorMultiplyDynamic/1/8 7 ns 7 ns 102367617
+// BM_MatrixVectorMultiplyDynamic/1/10 9 ns 9 ns 82419847
+// BM_MatrixVectorMultiplyDynamic/1/12 10 ns 10 ns 65129002
+// BM_MatrixVectorMultiplyDynamic/1/16 12 ns 12 ns 53500867
+// BM_MatrixVectorMultiplyDynamic/1/20 16 ns 16 ns 46067179
+// BM_MatrixVectorMultiplyDynamic/2/1 5 ns 5 ns 128880215
+// BM_MatrixVectorMultiplyDynamic/2/2 8 ns 8 ns 81938429
+// BM_MatrixVectorMultiplyDynamic/2/3 10 ns 10 ns 68807565
+// BM_MatrixVectorMultiplyDynamic/2/4 8 ns 8 ns 91833388
+// BM_MatrixVectorMultiplyDynamic/2/6 10 ns 10 ns 64031028
+// BM_MatrixVectorMultiplyDynamic/2/8 12 ns 12 ns 59788179
+// BM_MatrixVectorMultiplyDynamic/2/10 15 ns 15 ns 44737868
+// BM_MatrixVectorMultiplyDynamic/2/12 17 ns 17 ns 37423949
+// BM_MatrixVectorMultiplyDynamic/2/16 22 ns 22 ns 33470723
+// BM_MatrixVectorMultiplyDynamic/2/20 26 ns 26 ns 27076057
+// BM_MatrixVectorMultiplyDynamic/3/1 6 ns 6 ns 100932908
+// BM_MatrixVectorMultiplyDynamic/3/2 12 ns 12 ns 65591589
+// BM_MatrixVectorMultiplyDynamic/3/3 14 ns 14 ns 48182819
+// BM_MatrixVectorMultiplyDynamic/3/4 11 ns 11 ns 61770338
+// BM_MatrixVectorMultiplyDynamic/3/6 15 ns 15 ns 44712435
+// BM_MatrixVectorMultiplyDynamic/3/8 18 ns 18 ns 35177294
+// BM_MatrixVectorMultiplyDynamic/3/10 21 ns 21 ns 32164683
+// BM_MatrixVectorMultiplyDynamic/3/12 24 ns 24 ns 28222279
+// BM_MatrixVectorMultiplyDynamic/3/16 30 ns 30 ns 23050731
+// BM_MatrixVectorMultiplyDynamic/3/20 38 ns 38 ns 17832714
+// BM_MatrixVectorMultiplyDynamic/4/1 8 ns 8 ns 85763293
+// BM_MatrixVectorMultiplyDynamic/4/2 16 ns 16 ns 41959886
+// BM_MatrixVectorMultiplyDynamic/4/3 19 ns 19 ns 36674176
+// BM_MatrixVectorMultiplyDynamic/4/4 15 ns 15 ns 43561867
+// BM_MatrixVectorMultiplyDynamic/4/6 21 ns 21 ns 34278607
+// BM_MatrixVectorMultiplyDynamic/4/8 22 ns 22 ns 31484163
+// BM_MatrixVectorMultiplyDynamic/4/10 26 ns 26 ns 25605197
+// BM_MatrixVectorMultiplyDynamic/4/12 31 ns 31 ns 23380172
+// BM_MatrixVectorMultiplyDynamic/4/16 38 ns 38 ns 18054638
+// BM_MatrixVectorMultiplyDynamic/4/20 49 ns 49 ns 14771703
+void BM_MatrixVectorMultiplyDynamic(benchmark::State& state) {
+  // Times internal::MatrixVectorMultiply on a range(0) x range(1) matrix.
+  const int rows = state.range(0);
+  const int cols = state.range(1);
+  MatrixVectorMultiplyData data(rows, cols);
+  // Bind by reference: copying the pointer table is needless setup work.
+  const std::vector<double*>& ptrs = data.ptrs();
+  const int num_elements = data.num_elements();
+  int i = 0;
+  for (auto _ : state) {
+    // Use a different (A, b, c) triple on every iteration, wrapping around.
+    double* const* triple = &ptrs[3 * i];
+    internal::MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+        triple[0], rows, cols, triple[1], triple[2]);
+    i = (i + 1) % num_elements;
+  }
+}
+
+// Each ArgPair is (rows, cols) for one run; rows are 1-4, cols go up to 20.
+BENCHMARK(BM_MatrixVectorMultiplyDynamic)
+->ArgPair(1, 1)
+->ArgPair(1, 2)
+->ArgPair(1, 3)
+->ArgPair(1, 4)
+->ArgPair(1, 6)
+->ArgPair(1, 8)
+->ArgPair(1, 10)
+->ArgPair(1, 12)
+->ArgPair(1, 16)
+->ArgPair(1, 20)
+->ArgPair(2, 1)
+->ArgPair(2, 2)
+->ArgPair(2, 3)
+->ArgPair(2, 4)
+->ArgPair(2, 6)
+->ArgPair(2, 8)
+->ArgPair(2, 10)
+->ArgPair(2, 12)
+->ArgPair(2, 16)
+->ArgPair(2, 20)
+->ArgPair(3, 1)
+->ArgPair(3, 2)
+->ArgPair(3, 3)
+->ArgPair(3, 4)
+->ArgPair(3, 6)
+->ArgPair(3, 8)
+->ArgPair(3, 10)
+->ArgPair(3, 12)
+->ArgPair(3, 16)
+->ArgPair(3, 20)
+->ArgPair(4, 1)
+->ArgPair(4, 2)
+->ArgPair(4, 3)
+->ArgPair(4, 4)
+->ArgPair(4, 6)
+->ArgPair(4, 8)
+->ArgPair(4, 10)
+->ArgPair(4, 12)
+->ArgPair(4, 16)
+->ArgPair(4, 20);
+
+// Run on (8 X 2200 MHz CPU s)
+// 2018-02-06 21:18:17
+// ------------------------------------------------------------------------------------
+// Benchmark Time CPU Iterations
+// ------------------------------------------------------------------------------------
+// BM_MatrixTransposeVectorMultiplyDynamic/1/1 5 ns 5 ns 139356174
+// BM_MatrixTransposeVectorMultiplyDynamic/1/2 6 ns 6 ns 120800041
+// BM_MatrixTransposeVectorMultiplyDynamic/1/3 7 ns 7 ns 100267858
+// BM_MatrixTransposeVectorMultiplyDynamic/1/4 9 ns 9 ns 70778564
+// BM_MatrixTransposeVectorMultiplyDynamic/1/6 14 ns 14 ns 47748651
+// BM_MatrixTransposeVectorMultiplyDynamic/1/8 16 ns 16 ns 43903663
+// BM_MatrixTransposeVectorMultiplyDynamic/1/10 18 ns 18 ns 34838177
+// BM_MatrixTransposeVectorMultiplyDynamic/1/12 20 ns 20 ns 36138731
+// BM_MatrixTransposeVectorMultiplyDynamic/1/16 23 ns 23 ns 27063704
+// BM_MatrixTransposeVectorMultiplyDynamic/1/20 29 ns 29 ns 23400336
+// BM_MatrixTransposeVectorMultiplyDynamic/2/1 6 ns 6 ns 121572101
+// BM_MatrixTransposeVectorMultiplyDynamic/2/2 8 ns 8 ns 82896155
+// BM_MatrixTransposeVectorMultiplyDynamic/2/3 12 ns 12 ns 56705415
+// BM_MatrixTransposeVectorMultiplyDynamic/2/4 14 ns 14 ns 51241509
+// BM_MatrixTransposeVectorMultiplyDynamic/2/6 18 ns 18 ns 38377403
+// BM_MatrixTransposeVectorMultiplyDynamic/2/8 25 ns 25 ns 28560121
+// BM_MatrixTransposeVectorMultiplyDynamic/2/10 29 ns 29 ns 23608052
+// BM_MatrixTransposeVectorMultiplyDynamic/2/12 33 ns 33 ns 20668478
+// BM_MatrixTransposeVectorMultiplyDynamic/2/16 44 ns 44 ns 16335446
+// BM_MatrixTransposeVectorMultiplyDynamic/2/20 53 ns 53 ns 13462315
+// BM_MatrixTransposeVectorMultiplyDynamic/3/1 6 ns 6 ns 117031415
+// BM_MatrixTransposeVectorMultiplyDynamic/3/2 10 ns 10 ns 71040747
+// BM_MatrixTransposeVectorMultiplyDynamic/3/3 14 ns 14 ns 49453538
+// BM_MatrixTransposeVectorMultiplyDynamic/3/4 17 ns 17 ns 39161935
+// BM_MatrixTransposeVectorMultiplyDynamic/3/6 22 ns 22 ns 32118490
+// BM_MatrixTransposeVectorMultiplyDynamic/3/8 28 ns 28 ns 25295689
+// BM_MatrixTransposeVectorMultiplyDynamic/3/10 34 ns 34 ns 20900389
+// BM_MatrixTransposeVectorMultiplyDynamic/3/12 39 ns 39 ns 17934922
+// BM_MatrixTransposeVectorMultiplyDynamic/3/16 51 ns 51 ns 10000000
+// BM_MatrixTransposeVectorMultiplyDynamic/3/20 64 ns 64 ns 10594824
+// BM_MatrixTransposeVectorMultiplyDynamic/4/1 7 ns 7 ns 98903583
+// BM_MatrixTransposeVectorMultiplyDynamic/4/2 13 ns 13 ns 57301899
+// BM_MatrixTransposeVectorMultiplyDynamic/4/3 16 ns 16 ns 44622083
+// BM_MatrixTransposeVectorMultiplyDynamic/4/4 18 ns 18 ns 39645007
+// BM_MatrixTransposeVectorMultiplyDynamic/4/6 26 ns 26 ns 27239262
+// BM_MatrixTransposeVectorMultiplyDynamic/4/8 33 ns 33 ns 20869171
+// BM_MatrixTransposeVectorMultiplyDynamic/4/10 39 ns 39 ns 17169614
+// BM_MatrixTransposeVectorMultiplyDynamic/4/12 47 ns 47 ns 15045286
+// BM_MatrixTransposeVectorMultiplyDynamic/4/16 62 ns 62 ns 11437535
+// BM_MatrixTransposeVectorMultiplyDynamic/4/20 77 ns 77 ns 8351428
+void BM_MatrixTransposeVectorMultiplyDynamic(benchmark::State& state) {
+  // Times internal::MatrixTransposeVectorMultiply, range(0) x range(1) matrix.
+  const int rows = state.range(0);
+  const int cols = state.range(1);
+  MatrixVectorMultiplyData data(rows, cols);
+  // Bind by reference: copying the pointer table is needless setup work.
+  const std::vector<double*>& ptrs = data.ptrs();
+  const int num_elements = data.num_elements();
+  int i = 0;
+  for (auto _ : state) {
+    // New triple each iteration; c (rows values) is passed ahead of b (cols).
+    double* const* triple = &ptrs[3 * i];
+    internal::MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+        triple[0], rows, cols, triple[2], triple[1]);
+    i = (i + 1) % num_elements;
+  }
+}
+
+// Each ArgPair is (rows, cols) for one run; rows are 1-4, cols go up to 20.
+BENCHMARK(BM_MatrixTransposeVectorMultiplyDynamic)
+->ArgPair(1, 1)
+->ArgPair(1, 2)
+->ArgPair(1, 3)
+->ArgPair(1, 4)
+->ArgPair(1, 6)
+->ArgPair(1, 8)
+->ArgPair(1, 10)
+->ArgPair(1, 12)
+->ArgPair(1, 16)
+->ArgPair(1, 20)
+->ArgPair(2, 1)
+->ArgPair(2, 2)
+->ArgPair(2, 3)
+->ArgPair(2, 4)
+->ArgPair(2, 6)
+->ArgPair(2, 8)
+->ArgPair(2, 10)
+->ArgPair(2, 12)
+->ArgPair(2, 16)
+->ArgPair(2, 20)
+->ArgPair(3, 1)
+->ArgPair(3, 2)
+->ArgPair(3, 3)
+->ArgPair(3, 4)
+->ArgPair(3, 6)
+->ArgPair(3, 8)
+->ArgPair(3, 10)
+->ArgPair(3, 12)
+->ArgPair(3, 16)
+->ArgPair(3, 20)
+->ArgPair(4, 1)
+->ArgPair(4, 2)
+->ArgPair(4, 3)
+->ArgPair(4, 4)
+->ArgPair(4, 6)
+->ArgPair(4, 8)
+->ArgPair(4, 10)
+->ArgPair(4, 12)
+->ArgPair(4, 16)
+->ArgPair(4, 20);
+
+} // namespace ceres
+
+BENCHMARK_MAIN();  // Google Benchmark macro supplying the program's main().