Add a benchmark for small_blas.h

This CL adds a benchmark for MatrixVectorMultiply and
MatrixTransposeVectorMultiply.

Change-Id: I50e1cec72c91dedf081b6f00681874abf545123b
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index c5fb259..884fd57 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -377,4 +377,8 @@
 if (BUILD_BENCHMARKS)
   add_executable(autodiff_cost_function_benchmark autodiff_cost_function_benchmark.cc)
   target_link_libraries(autodiff_cost_function_benchmark ceres benchmark::benchmark)
+
+  add_executable(small_blas_benchmark small_blas_benchmark.cc)
+  target_link_libraries(small_blas_benchmark ceres benchmark::benchmark)
+
 endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/small_blas_benchmark.cc b/internal/ceres/small_blas_benchmark.cc
new file mode 100644
index 0000000..c78a69d
--- /dev/null
+++ b/internal/ceres/small_blas_benchmark.cc
@@ -0,0 +1,296 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: sameeragarwal@google.com (Sameer Agarwal)
+
+#include <iostream>
+#include "Eigen/Dense"
+#include "benchmark/benchmark.h"
+#include "ceres/small_blas.h"
+
+namespace ceres {
+
+// Benchmarking matrix-vector multiply routines and optimizing memory
+// access requires that we make sure that the operands are not just
+// sitting in the cache. So, as the benchmarking routine iterates, we
+// need to multiply new/different matrices and vectors. Allocating or
+// creating these objects in the benchmarking loop is too heavy duty,
+// so we create them beforehand and cycle through them in the
+// benchmark. This class, given the size of the matrix, creates such
+// matrix and vector objects for use in the benchmark.
+class MatrixVectorMultiplyData {
+ public:
+  MatrixVectorMultiplyData(int rows, int cols) {
+    num_elements_ = 1000;
+    // A single memory buffer for all the matrices & vectors, sized from
+    // the requested dimensions so that no (rows, cols) can overrun it.
+    const size_t per_element = static_cast<size_t>(rows) * cols + cols + rows;
+    data_.resize(num_elements_ * per_element, 1.00000000000001);
+
+    // Each element is three pointers, corresponding to the three
+    // operands of the expression c = A * b.
+    ptrs_.resize(3 * num_elements_, nullptr);
+    double* p = data_.data();
+    for (int i = 0; i < num_elements_; ++i) {
+      // Matrix A (rows x cols).
+      ptrs_[3 * i] = p;
+      p += rows * cols;
+      // Vector b (cols entries).
+      ptrs_[3 * i + 1] = p;
+      p += cols;
+      // Vector c (rows entries).
+      ptrs_[3 * i + 2] = p;
+      p += rows;
+    }
+  }
+
+  int num_elements() const { return num_elements_; }
+  double* data() { return data_.data(); }
+  const std::vector<double*>& ptrs() const { return ptrs_; }
+
+ private:
+  int num_elements_;
+  std::vector<double> data_;
+  std::vector<double*> ptrs_;
+};
+
+// Run on (8 X 2200 MHz CPU s)
+// 2018-02-06 21:23:59
+// ---------------------------------------------------------------------------
+// Benchmark                                    Time           CPU Iterations
+// ---------------------------------------------------------------------------
+// BM_MatrixVectorMultiplyDynamic/1/1           4 ns          4 ns  165611093
+// BM_MatrixVectorMultiplyDynamic/1/2           5 ns          5 ns  140648672
+// BM_MatrixVectorMultiplyDynamic/1/3           5 ns          5 ns  139414459
+// BM_MatrixVectorMultiplyDynamic/1/4           5 ns          5 ns  144247512
+// BM_MatrixVectorMultiplyDynamic/1/6           6 ns          6 ns  106639042
+// BM_MatrixVectorMultiplyDynamic/1/8           7 ns          7 ns  102367617
+// BM_MatrixVectorMultiplyDynamic/1/10          9 ns          9 ns   82419847
+// BM_MatrixVectorMultiplyDynamic/1/12         10 ns         10 ns   65129002
+// BM_MatrixVectorMultiplyDynamic/1/16         12 ns         12 ns   53500867
+// BM_MatrixVectorMultiplyDynamic/1/20         16 ns         16 ns   46067179
+// BM_MatrixVectorMultiplyDynamic/2/1           5 ns          5 ns  128880215
+// BM_MatrixVectorMultiplyDynamic/2/2           8 ns          8 ns   81938429
+// BM_MatrixVectorMultiplyDynamic/2/3          10 ns         10 ns   68807565
+// BM_MatrixVectorMultiplyDynamic/2/4           8 ns          8 ns   91833388
+// BM_MatrixVectorMultiplyDynamic/2/6          10 ns         10 ns   64031028
+// BM_MatrixVectorMultiplyDynamic/2/8          12 ns         12 ns   59788179
+// BM_MatrixVectorMultiplyDynamic/2/10         15 ns         15 ns   44737868
+// BM_MatrixVectorMultiplyDynamic/2/12         17 ns         17 ns   37423949
+// BM_MatrixVectorMultiplyDynamic/2/16         22 ns         22 ns   33470723
+// BM_MatrixVectorMultiplyDynamic/2/20         26 ns         26 ns   27076057
+// BM_MatrixVectorMultiplyDynamic/3/1           6 ns          6 ns  100932908
+// BM_MatrixVectorMultiplyDynamic/3/2          12 ns         12 ns   65591589
+// BM_MatrixVectorMultiplyDynamic/3/3          14 ns         14 ns   48182819
+// BM_MatrixVectorMultiplyDynamic/3/4          11 ns         11 ns   61770338
+// BM_MatrixVectorMultiplyDynamic/3/6          15 ns         15 ns   44712435
+// BM_MatrixVectorMultiplyDynamic/3/8          18 ns         18 ns   35177294
+// BM_MatrixVectorMultiplyDynamic/3/10         21 ns         21 ns   32164683
+// BM_MatrixVectorMultiplyDynamic/3/12         24 ns         24 ns   28222279
+// BM_MatrixVectorMultiplyDynamic/3/16         30 ns         30 ns   23050731
+// BM_MatrixVectorMultiplyDynamic/3/20         38 ns         38 ns   17832714
+// BM_MatrixVectorMultiplyDynamic/4/1           8 ns          8 ns   85763293
+// BM_MatrixVectorMultiplyDynamic/4/2          16 ns         16 ns   41959886
+// BM_MatrixVectorMultiplyDynamic/4/3          19 ns         19 ns   36674176
+// BM_MatrixVectorMultiplyDynamic/4/4          15 ns         15 ns   43561867
+// BM_MatrixVectorMultiplyDynamic/4/6          21 ns         21 ns   34278607
+// BM_MatrixVectorMultiplyDynamic/4/8          22 ns         22 ns   31484163
+// BM_MatrixVectorMultiplyDynamic/4/10         26 ns         26 ns   25605197
+// BM_MatrixVectorMultiplyDynamic/4/12         31 ns         31 ns   23380172
+// BM_MatrixVectorMultiplyDynamic/4/16         38 ns         38 ns   18054638
+// BM_MatrixVectorMultiplyDynamic/4/20         49 ns         49 ns   14771703
+void BM_MatrixVectorMultiplyDynamic(benchmark::State& state) {
+  // Benchmarks internal::MatrixVectorMultiply; range(0)=rows, range(1)=cols.
+  const int rows = state.range(0);
+  const int cols = state.range(1);
+  MatrixVectorMultiplyData data(rows, cols);
+  // ptrs() returns a const&; bind to it instead of copying the table.
+  const std::vector<double*>& ptrs = data.ptrs();
+  const int num_elements = data.num_elements();
+  int i = 0;
+  for (auto _ : state) {
+    // Use a different pre-built (A, b, c) triple on each iteration.
+    double* const* triple = &ptrs[3 * i];
+    internal::MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+        triple[0], rows, cols, triple[1], triple[2]);
+    i = (i + 1) % num_elements;
+  }
+}
+
+// Each Args({rows, cols}) entry registers one matrix size to benchmark.
+BENCHMARK(BM_MatrixVectorMultiplyDynamic)
+    ->Args({1, 1})
+    ->Args({1, 2})
+    ->Args({1, 3})
+    ->Args({1, 4})
+    ->Args({1, 6})
+    ->Args({1, 8})
+    ->Args({1, 10})
+    ->Args({1, 12})
+    ->Args({1, 16})
+    ->Args({1, 20})
+    ->Args({2, 1})
+    ->Args({2, 2})
+    ->Args({2, 3})
+    ->Args({2, 4})
+    ->Args({2, 6})
+    ->Args({2, 8})
+    ->Args({2, 10})
+    ->Args({2, 12})
+    ->Args({2, 16})
+    ->Args({2, 20})
+    ->Args({3, 1})
+    ->Args({3, 2})
+    ->Args({3, 3})
+    ->Args({3, 4})
+    ->Args({3, 6})
+    ->Args({3, 8})
+    ->Args({3, 10})
+    ->Args({3, 12})
+    ->Args({3, 16})
+    ->Args({3, 20})
+    ->Args({4, 1})
+    ->Args({4, 2})
+    ->Args({4, 3})
+    ->Args({4, 4})
+    ->Args({4, 6})
+    ->Args({4, 8})
+    ->Args({4, 10})
+    ->Args({4, 12})
+    ->Args({4, 16})
+    ->Args({4, 20});
+
+// Run on (8 X 2200 MHz CPU s)
+// 2018-02-06 21:18:17
+// ------------------------------------------------------------------------------------
+// Benchmark                                             Time           CPU Iterations
+// ------------------------------------------------------------------------------------
+// BM_MatrixTransposeVectorMultiplyDynamic/1/1           5 ns          5 ns  139356174
+// BM_MatrixTransposeVectorMultiplyDynamic/1/2           6 ns          6 ns  120800041
+// BM_MatrixTransposeVectorMultiplyDynamic/1/3           7 ns          7 ns  100267858
+// BM_MatrixTransposeVectorMultiplyDynamic/1/4           9 ns          9 ns   70778564
+// BM_MatrixTransposeVectorMultiplyDynamic/1/6          14 ns         14 ns   47748651
+// BM_MatrixTransposeVectorMultiplyDynamic/1/8          16 ns         16 ns   43903663
+// BM_MatrixTransposeVectorMultiplyDynamic/1/10         18 ns         18 ns   34838177
+// BM_MatrixTransposeVectorMultiplyDynamic/1/12         20 ns         20 ns   36138731
+// BM_MatrixTransposeVectorMultiplyDynamic/1/16         23 ns         23 ns   27063704
+// BM_MatrixTransposeVectorMultiplyDynamic/1/20         29 ns         29 ns   23400336
+// BM_MatrixTransposeVectorMultiplyDynamic/2/1           6 ns          6 ns  121572101
+// BM_MatrixTransposeVectorMultiplyDynamic/2/2           8 ns          8 ns   82896155
+// BM_MatrixTransposeVectorMultiplyDynamic/2/3          12 ns         12 ns   56705415
+// BM_MatrixTransposeVectorMultiplyDynamic/2/4          14 ns         14 ns   51241509
+// BM_MatrixTransposeVectorMultiplyDynamic/2/6          18 ns         18 ns   38377403
+// BM_MatrixTransposeVectorMultiplyDynamic/2/8          25 ns         25 ns   28560121
+// BM_MatrixTransposeVectorMultiplyDynamic/2/10         29 ns         29 ns   23608052
+// BM_MatrixTransposeVectorMultiplyDynamic/2/12         33 ns         33 ns   20668478
+// BM_MatrixTransposeVectorMultiplyDynamic/2/16         44 ns         44 ns   16335446
+// BM_MatrixTransposeVectorMultiplyDynamic/2/20         53 ns         53 ns   13462315
+// BM_MatrixTransposeVectorMultiplyDynamic/3/1           6 ns          6 ns  117031415
+// BM_MatrixTransposeVectorMultiplyDynamic/3/2          10 ns         10 ns   71040747
+// BM_MatrixTransposeVectorMultiplyDynamic/3/3          14 ns         14 ns   49453538
+// BM_MatrixTransposeVectorMultiplyDynamic/3/4          17 ns         17 ns   39161935
+// BM_MatrixTransposeVectorMultiplyDynamic/3/6          22 ns         22 ns   32118490
+// BM_MatrixTransposeVectorMultiplyDynamic/3/8          28 ns         28 ns   25295689
+// BM_MatrixTransposeVectorMultiplyDynamic/3/10         34 ns         34 ns   20900389
+// BM_MatrixTransposeVectorMultiplyDynamic/3/12         39 ns         39 ns   17934922
+// BM_MatrixTransposeVectorMultiplyDynamic/3/16         51 ns         51 ns   10000000
+// BM_MatrixTransposeVectorMultiplyDynamic/3/20         64 ns         64 ns   10594824
+// BM_MatrixTransposeVectorMultiplyDynamic/4/1           7 ns          7 ns   98903583
+// BM_MatrixTransposeVectorMultiplyDynamic/4/2          13 ns         13 ns   57301899
+// BM_MatrixTransposeVectorMultiplyDynamic/4/3          16 ns         16 ns   44622083
+// BM_MatrixTransposeVectorMultiplyDynamic/4/4          18 ns         18 ns   39645007
+// BM_MatrixTransposeVectorMultiplyDynamic/4/6          26 ns         26 ns   27239262
+// BM_MatrixTransposeVectorMultiplyDynamic/4/8          33 ns         33 ns   20869171
+// BM_MatrixTransposeVectorMultiplyDynamic/4/10         39 ns         39 ns   17169614
+// BM_MatrixTransposeVectorMultiplyDynamic/4/12         47 ns         47 ns   15045286
+// BM_MatrixTransposeVectorMultiplyDynamic/4/16         62 ns         62 ns   11437535
+// BM_MatrixTransposeVectorMultiplyDynamic/4/20         77 ns         77 ns    8351428
+void BM_MatrixTransposeVectorMultiplyDynamic(benchmark::State& state) {
+  // Benchmarks internal::MatrixTransposeVectorMultiply with run-time sizes.
+  const int rows = state.range(0);
+  const int cols = state.range(1);
+  MatrixVectorMultiplyData data(rows, cols);
+  // ptrs() returns a const&; bind to it instead of copying the table.
+  const std::vector<double*>& ptrs = data.ptrs();
+  const int num_elements = data.num_elements();
+  int i = 0;
+  for (auto _ : state) {
+    // A' is cols x rows: pass the length-rows slot 4th, the length-cols 5th.
+    double* const* triple = &ptrs[3 * i];
+    internal::MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+        triple[0], rows, cols, triple[2], triple[1]);
+    i = (i + 1) % num_elements;
+  }
+}
+
+// Each Args({rows, cols}) entry registers one matrix size to benchmark.
+BENCHMARK(BM_MatrixTransposeVectorMultiplyDynamic)
+    ->Args({1, 1})
+    ->Args({1, 2})
+    ->Args({1, 3})
+    ->Args({1, 4})
+    ->Args({1, 6})
+    ->Args({1, 8})
+    ->Args({1, 10})
+    ->Args({1, 12})
+    ->Args({1, 16})
+    ->Args({1, 20})
+    ->Args({2, 1})
+    ->Args({2, 2})
+    ->Args({2, 3})
+    ->Args({2, 4})
+    ->Args({2, 6})
+    ->Args({2, 8})
+    ->Args({2, 10})
+    ->Args({2, 12})
+    ->Args({2, 16})
+    ->Args({2, 20})
+    ->Args({3, 1})
+    ->Args({3, 2})
+    ->Args({3, 3})
+    ->Args({3, 4})
+    ->Args({3, 6})
+    ->Args({3, 8})
+    ->Args({3, 10})
+    ->Args({3, 12})
+    ->Args({3, 16})
+    ->Args({3, 20})
+    ->Args({4, 1})
+    ->Args({4, 2})
+    ->Args({4, 3})
+    ->Args({4, 4})
+    ->Args({4, 6})
+    ->Args({4, 8})
+    ->Args({4, 10})
+    ->Args({4, 12})
+    ->Args({4, 16})
+    ->Args({4, 20});
+
+}  // namespace ceres
+
+BENCHMARK_MAIN();