internal/ceres/small_blas_benchmark.cc - ceres-solver - Git at Google

 // Ceres Solver - A fast non-linear least squares minimizer
 // Copyright 2018 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // * Redistributions of source code must retain the above copyright notice,
 //   this list of conditions and the following disclaimer.
 // * Redistributions in binary form must reproduce the above copyright notice,
 //   this list of conditions and the following disclaimer in the documentation
 //   and/or other materials provided with the distribution.
 // * Neither the name of Google Inc. nor the names of its contributors may be
 //   used to endorse or promote products derived from this software without
 //   specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Authors: sameeragarwal@google.com (Sameer Agarwal)

 #include <iostream>
 #include "Eigen/Dense"
 #include "benchmark/benchmark.h"
 #include "ceres/small_blas.h"

 namespace ceres {

 // Benchmarking matrix-vector multiply routines and optimizing memory
 // access requires that we make sure that they are not just sitting in
 // the cache. So, as the benchmarking routine iterates, we need to
 // multiply new/different matrice and vectors. Allocating/creating
 // these objects in the benchmarking loop is too heavy duty, so we
 // create them before hand and cycle through them in the
 // benchmark. This class, given the size of the matrix creates such
 // matrix and vector objects for use in the benchmark.
 class MatrixVectorMultiplyData {
  public:
   MatrixVectorMultiplyData(int rows, int cols) {
     num_elements_ = 1000;
     // A single memory buffer for all the matrices & vectors.

     size_t buffer_size = num_elements_ * (200);
     data_.resize(buffer_size, 1.00000000000001);

     // Each element is three points, corresponding to the three
     // elements of the expression c = A * b.
     ptrs_.resize(3 * num_elements_, NULL);
     double* p = &data_[0];
     for (int i = 0; i < num_elements_; ++i) {
       // Matrix X.
       ptrs_[3 * i] = p;
       p += rows * cols;
       // Vector b.
       ptrs_[3 * i + 1] = p;
       p += cols;
       // Vector c.
       ptrs_[3 * i + 2] = p;
       p += rows;
     }
   }

   int num_elements() const { return num_elements_; }
   double* data() { return &data_[0]; }
   const std::vector<double*>& ptrs() const { return ptrs_; }

  private:
   int num_elements_;
   std::vector<double> data_;
   std::vector<double*> ptrs_;
 };

 // Run on (8 X 2200 MHz CPU s)
 // 2018-02-06 21:23:59
 // ---------------------------------------------------------------------------
 // Benchmark                                    Time           CPU Iterations
 // ---------------------------------------------------------------------------
 // BM_MatrixVectorMultiplyDynamic/1/1           4 ns          4 ns  165611093
 // BM_MatrixVectorMultiplyDynamic/1/2           5 ns          5 ns  140648672
 // BM_MatrixVectorMultiplyDynamic/1/3           5 ns          5 ns  139414459
 // BM_MatrixVectorMultiplyDynamic/1/4           5 ns          5 ns  144247512
 // BM_MatrixVectorMultiplyDynamic/1/6           6 ns          6 ns  106639042
 // BM_MatrixVectorMultiplyDynamic/1/8           7 ns          7 ns  102367617
 // BM_MatrixVectorMultiplyDynamic/1/10          9 ns          9 ns   82419847
 // BM_MatrixVectorMultiplyDynamic/1/12         10 ns         10 ns   65129002
 // BM_MatrixVectorMultiplyDynamic/1/16         12 ns         12 ns   53500867
 // BM_MatrixVectorMultiplyDynamic/1/20         16 ns         16 ns   46067179
 // BM_MatrixVectorMultiplyDynamic/2/1           5 ns          5 ns  128880215
 // BM_MatrixVectorMultiplyDynamic/2/2           8 ns          8 ns   81938429
 // BM_MatrixVectorMultiplyDynamic/2/3          10 ns         10 ns   68807565
 // BM_MatrixVectorMultiplyDynamic/2/4           8 ns          8 ns   91833388
 // BM_MatrixVectorMultiplyDynamic/2/6          10 ns         10 ns   64031028
 // BM_MatrixVectorMultiplyDynamic/2/8          12 ns         12 ns   59788179
 // BM_MatrixVectorMultiplyDynamic/2/10         15 ns         15 ns   44737868
 // BM_MatrixVectorMultiplyDynamic/2/12         17 ns         17 ns   37423949
 // BM_MatrixVectorMultiplyDynamic/2/16         22 ns         22 ns   33470723
 // BM_MatrixVectorMultiplyDynamic/2/20         26 ns         26 ns   27076057
 // BM_MatrixVectorMultiplyDynamic/3/1           6 ns          6 ns  100932908
 // BM_MatrixVectorMultiplyDynamic/3/2          12 ns         12 ns   65591589
 // BM_MatrixVectorMultiplyDynamic/3/3          14 ns         14 ns   48182819
 // BM_MatrixVectorMultiplyDynamic/3/4          11 ns         11 ns   61770338
 // BM_MatrixVectorMultiplyDynamic/3/6          15 ns         15 ns   44712435
 // BM_MatrixVectorMultiplyDynamic/3/8          18 ns         18 ns   35177294
 // BM_MatrixVectorMultiplyDynamic/3/10         21 ns         21 ns   32164683
 // BM_MatrixVectorMultiplyDynamic/3/12         24 ns         24 ns   28222279
 // BM_MatrixVectorMultiplyDynamic/3/16         30 ns         30 ns   23050731
 // BM_MatrixVectorMultiplyDynamic/3/20         38 ns         38 ns   17832714
 // BM_MatrixVectorMultiplyDynamic/4/1           8 ns          8 ns   85763293
 // BM_MatrixVectorMultiplyDynamic/4/2          16 ns         16 ns   41959886
 // BM_MatrixVectorMultiplyDynamic/4/3          19 ns         19 ns   36674176
 // BM_MatrixVectorMultiplyDynamic/4/4          15 ns         15 ns   43561867
 // BM_MatrixVectorMultiplyDynamic/4/6          21 ns         21 ns   34278607
 // BM_MatrixVectorMultiplyDynamic/4/8          22 ns         22 ns   31484163
 // BM_MatrixVectorMultiplyDynamic/4/10         26 ns         26 ns   25605197
 // BM_MatrixVectorMultiplyDynamic/4/12         31 ns         31 ns   23380172
 // BM_MatrixVectorMultiplyDynamic/4/16         38 ns         38 ns   18054638
 // BM_MatrixVectorMultiplyDynamic/4/20         49 ns         49 ns   14771703
 void BM_MatrixVectorMultiplyDynamic(benchmark::State& state) {
   const int rows = state.range(0);
   const int cols = state.range(1);
   MatrixVectorMultiplyData data(rows, cols);
   const std::vector<double*> ptrs = data.ptrs();
   const int num_elements = data.num_elements();

   int i = 0;
   for (auto _ : state) {
     double* a_ptr = ptrs[3 * i];
     double* b_ptr = ptrs[3 * i + 1];
     double* c_ptr = ptrs[3 * i + 2];
     internal::MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
         a_ptr, rows, cols, b_ptr, c_ptr);
     i = (i + 1) % num_elements;
   }
 }

 // Each ArgPair specifies a row and column size of the matrix.
 BENCHMARK(BM_MatrixVectorMultiplyDynamic)
 ->ArgPair(1, 1)
 ->ArgPair(1, 2)
 ->ArgPair(1, 3)
 ->ArgPair(1, 4)
 ->ArgPair(1, 6)
 ->ArgPair(1, 8)
 ->ArgPair(1, 10)
 ->ArgPair(1, 12)
 ->ArgPair(1, 16)
 ->ArgPair(1, 20)
 ->ArgPair(2, 1)
 ->ArgPair(2, 2)
 ->ArgPair(2, 3)
 ->ArgPair(2, 4)
 ->ArgPair(2, 6)
 ->ArgPair(2, 8)
 ->ArgPair(2, 10)
 ->ArgPair(2, 12)
 ->ArgPair(2, 16)
 ->ArgPair(2, 20)
 ->ArgPair(3, 1)
 ->ArgPair(3, 2)
 ->ArgPair(3, 3)
 ->ArgPair(3, 4)
 ->ArgPair(3, 6)
 ->ArgPair(3, 8)
 ->ArgPair(3, 10)
 ->ArgPair(3, 12)
 ->ArgPair(3, 16)
 ->ArgPair(3, 20)
 ->ArgPair(4, 1)
 ->ArgPair(4, 2)
 ->ArgPair(4, 3)
 ->ArgPair(4, 4)
 ->ArgPair(4, 6)
 ->ArgPair(4, 8)
 ->ArgPair(4, 10)
 ->ArgPair(4, 12)
 ->ArgPair(4, 16)
 ->ArgPair(4, 20);

 // Run on (8 X 2200 MHz CPU s)
 // 2018-02-06 21:18:17
 // ------------------------------------------------------------------------------------
 // Benchmark                                             Time           CPU Iterations
 // ------------------------------------------------------------------------------------
 // BM_MatrixTransposeVectorMultiplyDynamic/1/1           5 ns          5 ns  139356174
 // BM_MatrixTransposeVectorMultiplyDynamic/1/2           6 ns          6 ns  120800041
 // BM_MatrixTransposeVectorMultiplyDynamic/1/3           7 ns          7 ns  100267858
 // BM_MatrixTransposeVectorMultiplyDynamic/1/4           9 ns          9 ns   70778564
 // BM_MatrixTransposeVectorMultiplyDynamic/1/6          14 ns         14 ns   47748651
 // BM_MatrixTransposeVectorMultiplyDynamic/1/8          16 ns         16 ns   43903663
 // BM_MatrixTransposeVectorMultiplyDynamic/1/10         18 ns         18 ns   34838177
 // BM_MatrixTransposeVectorMultiplyDynamic/1/12         20 ns         20 ns   36138731
 // BM_MatrixTransposeVectorMultiplyDynamic/1/16         23 ns         23 ns   27063704
 // BM_MatrixTransposeVectorMultiplyDynamic/1/20         29 ns         29 ns   23400336
 // BM_MatrixTransposeVectorMultiplyDynamic/2/1           6 ns          6 ns  121572101
 // BM_MatrixTransposeVectorMultiplyDynamic/2/2           8 ns          8 ns   82896155
 // BM_MatrixTransposeVectorMultiplyDynamic/2/3          12 ns         12 ns   56705415
 // BM_MatrixTransposeVectorMultiplyDynamic/2/4          14 ns         14 ns   51241509
 // BM_MatrixTransposeVectorMultiplyDynamic/2/6          18 ns         18 ns   38377403
 // BM_MatrixTransposeVectorMultiplyDynamic/2/8          25 ns         25 ns   28560121
 // BM_MatrixTransposeVectorMultiplyDynamic/2/10         29 ns         29 ns   23608052
 // BM_MatrixTransposeVectorMultiplyDynamic/2/12         33 ns         33 ns   20668478
 // BM_MatrixTransposeVectorMultiplyDynamic/2/16         44 ns         44 ns   16335446
 // BM_MatrixTransposeVectorMultiplyDynamic/2/20         53 ns         53 ns   13462315
 // BM_MatrixTransposeVectorMultiplyDynamic/3/1           6 ns          6 ns  117031415
 // BM_MatrixTransposeVectorMultiplyDynamic/3/2          10 ns         10 ns   71040747
 // BM_MatrixTransposeVectorMultiplyDynamic/3/3          14 ns         14 ns   49453538
 // BM_MatrixTransposeVectorMultiplyDynamic/3/4          17 ns         17 ns   39161935
 // BM_MatrixTransposeVectorMultiplyDynamic/3/6          22 ns         22 ns   32118490
 // BM_MatrixTransposeVectorMultiplyDynamic/3/8          28 ns         28 ns   25295689
 // BM_MatrixTransposeVectorMultiplyDynamic/3/10         34 ns         34 ns   20900389
 // BM_MatrixTransposeVectorMultiplyDynamic/3/12         39 ns         39 ns   17934922
 // BM_MatrixTransposeVectorMultiplyDynamic/3/16         51 ns         51 ns   10000000
 // BM_MatrixTransposeVectorMultiplyDynamic/3/20         64 ns         64 ns   10594824
 // BM_MatrixTransposeVectorMultiplyDynamic/4/1           7 ns          7 ns   98903583
 // BM_MatrixTransposeVectorMultiplyDynamic/4/2          13 ns         13 ns   57301899
 // BM_MatrixTransposeVectorMultiplyDynamic/4/3          16 ns         16 ns   44622083
 // BM_MatrixTransposeVectorMultiplyDynamic/4/4          18 ns         18 ns   39645007
 // BM_MatrixTransposeVectorMultiplyDynamic/4/6          26 ns         26 ns   27239262
 // BM_MatrixTransposeVectorMultiplyDynamic/4/8          33 ns         33 ns   20869171
 // BM_MatrixTransposeVectorMultiplyDynamic/4/10         39 ns         39 ns   17169614
 // BM_MatrixTransposeVectorMultiplyDynamic/4/12         47 ns         47 ns   15045286
 // BM_MatrixTransposeVectorMultiplyDynamic/4/16         62 ns         62 ns   11437535
 // BM_MatrixTransposeVectorMultiplyDynamic/4/20         77 ns         77 ns    8351428
 void BM_MatrixTransposeVectorMultiplyDynamic(benchmark::State& state) {
   const int rows = state.range(0);
   const int cols = state.range(1);
   MatrixVectorMultiplyData data(rows, cols);
   const std::vector<double*> ptrs = data.ptrs();
   const int num_elements = data.num_elements();

   int i = 0;
   for (auto _ : state) {
     double* a_ptr = ptrs[3 * i];
     double* b_ptr = ptrs[3 * i + 1];
     double* c_ptr = ptrs[3 * i + 2];
     internal::MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
         a_ptr, rows, cols, c_ptr, b_ptr);
     i = (i + 1) % num_elements;
   }
 }

 // Each ArgPair specifies a row and column size of the matrix.
 BENCHMARK(BM_MatrixTransposeVectorMultiplyDynamic)
 ->ArgPair(1, 1)
 ->ArgPair(1, 2)
 ->ArgPair(1, 3)
 ->ArgPair(1, 4)
 ->ArgPair(1, 6)
 ->ArgPair(1, 8)
 ->ArgPair(1, 10)
 ->ArgPair(1, 12)
 ->ArgPair(1, 16)
 ->ArgPair(1, 20)
 ->ArgPair(2, 1)
 ->ArgPair(2, 2)
 ->ArgPair(2, 3)
 ->ArgPair(2, 4)
 ->ArgPair(2, 6)
 ->ArgPair(2, 8)
 ->ArgPair(2, 10)
 ->ArgPair(2, 12)
 ->ArgPair(2, 16)
 ->ArgPair(2, 20)
 ->ArgPair(3, 1)
 ->ArgPair(3, 2)
 ->ArgPair(3, 3)
 ->ArgPair(3, 4)
 ->ArgPair(3, 6)
 ->ArgPair(3, 8)
 ->ArgPair(3, 10)
 ->ArgPair(3, 12)
 ->ArgPair(3, 16)
 ->ArgPair(3, 20)
 ->ArgPair(4, 1)
 ->ArgPair(4, 2)
 ->ArgPair(4, 3)
 ->ArgPair(4, 4)
 ->ArgPair(4, 6)
 ->ArgPair(4, 8)
 ->ArgPair(4, 10)
 ->ArgPair(4, 12)
 ->ArgPair(4, 16)
 ->ArgPair(4, 20);

 }  // namespace ceres

 BENCHMARK_MAIN();
	// Ceres Solver - A fast non-linear least squares minimizer
	// Copyright 2018 Google Inc. All rights reserved.
	// http://ceres-solver.org/
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright notice,
	// this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above copyright notice,
	// this list of conditions and the following disclaimer in the documentation
	// and/or other materials provided with the distribution.
	// * Neither the name of Google Inc. nor the names of its contributors may be
	// used to endorse or promote products derived from this software without
	// specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.
	//
	// Authors: sameeragarwal@google.com (Sameer Agarwal)

	#include <iostream>
	#include "Eigen/Dense"
	#include "benchmark/benchmark.h"
	#include "ceres/small_blas.h"

	namespace ceres {

	// Benchmarking matrix-vector multiply routines and optimizing memory
	// access requires that we make sure that they are not just sitting in
	// the cache. So, as the benchmarking routine iterates, we need to
	// multiply new/different matrice and vectors. Allocating/creating
	// these objects in the benchmarking loop is too heavy duty, so we
	// create them before hand and cycle through them in the
	// benchmark. This class, given the size of the matrix creates such
	// matrix and vector objects for use in the benchmark.
	class MatrixVectorMultiplyData {
	public:
	MatrixVectorMultiplyData(int rows, int cols) {
	num_elements_ = 1000;
	// A single memory buffer for all the matrices & vectors.

	size_t buffer_size = num_elements_ * (200);
	data_.resize(buffer_size, 1.00000000000001);

	// Each element is three points, corresponding to the three
	// elements of the expression c = A * b.
	ptrs_.resize(3 * num_elements_, NULL);
	double* p = &data_[0];
	for (int i = 0; i < num_elements_; ++i) {
	// Matrix X.
	ptrs_[3 * i] = p;
	p += rows * cols;
	// Vector b.
	ptrs_[3 * i + 1] = p;
	p += cols;
	// Vector c.
	ptrs_[3 * i + 2] = p;
	p += rows;
	}
	}

	int num_elements() const { return num_elements_; }
	double* data() { return &data_[0]; }
	const std::vector<double*>& ptrs() const { return ptrs_; }

	private:
	int num_elements_;
	std::vector<double> data_;
	std::vector<double*> ptrs_;
	};

	// Run on (8 X 2200 MHz CPU s)
	// 2018-02-06 21:23:59
	// ---------------------------------------------------------------------------
	// Benchmark Time CPU Iterations
	// ---------------------------------------------------------------------------
	// BM_MatrixVectorMultiplyDynamic/1/1 4 ns 4 ns 165611093
	// BM_MatrixVectorMultiplyDynamic/1/2 5 ns 5 ns 140648672
	// BM_MatrixVectorMultiplyDynamic/1/3 5 ns 5 ns 139414459
	// BM_MatrixVectorMultiplyDynamic/1/4 5 ns 5 ns 144247512
	// BM_MatrixVectorMultiplyDynamic/1/6 6 ns 6 ns 106639042
	// BM_MatrixVectorMultiplyDynamic/1/8 7 ns 7 ns 102367617
	// BM_MatrixVectorMultiplyDynamic/1/10 9 ns 9 ns 82419847
	// BM_MatrixVectorMultiplyDynamic/1/12 10 ns 10 ns 65129002
	// BM_MatrixVectorMultiplyDynamic/1/16 12 ns 12 ns 53500867
	// BM_MatrixVectorMultiplyDynamic/1/20 16 ns 16 ns 46067179
	// BM_MatrixVectorMultiplyDynamic/2/1 5 ns 5 ns 128880215
	// BM_MatrixVectorMultiplyDynamic/2/2 8 ns 8 ns 81938429
	// BM_MatrixVectorMultiplyDynamic/2/3 10 ns 10 ns 68807565
	// BM_MatrixVectorMultiplyDynamic/2/4 8 ns 8 ns 91833388
	// BM_MatrixVectorMultiplyDynamic/2/6 10 ns 10 ns 64031028
	// BM_MatrixVectorMultiplyDynamic/2/8 12 ns 12 ns 59788179
	// BM_MatrixVectorMultiplyDynamic/2/10 15 ns 15 ns 44737868
	// BM_MatrixVectorMultiplyDynamic/2/12 17 ns 17 ns 37423949
	// BM_MatrixVectorMultiplyDynamic/2/16 22 ns 22 ns 33470723
	// BM_MatrixVectorMultiplyDynamic/2/20 26 ns 26 ns 27076057
	// BM_MatrixVectorMultiplyDynamic/3/1 6 ns 6 ns 100932908
	// BM_MatrixVectorMultiplyDynamic/3/2 12 ns 12 ns 65591589
	// BM_MatrixVectorMultiplyDynamic/3/3 14 ns 14 ns 48182819
	// BM_MatrixVectorMultiplyDynamic/3/4 11 ns 11 ns 61770338
	// BM_MatrixVectorMultiplyDynamic/3/6 15 ns 15 ns 44712435
	// BM_MatrixVectorMultiplyDynamic/3/8 18 ns 18 ns 35177294
	// BM_MatrixVectorMultiplyDynamic/3/10 21 ns 21 ns 32164683
	// BM_MatrixVectorMultiplyDynamic/3/12 24 ns 24 ns 28222279
	// BM_MatrixVectorMultiplyDynamic/3/16 30 ns 30 ns 23050731
	// BM_MatrixVectorMultiplyDynamic/3/20 38 ns 38 ns 17832714
	// BM_MatrixVectorMultiplyDynamic/4/1 8 ns 8 ns 85763293
	// BM_MatrixVectorMultiplyDynamic/4/2 16 ns 16 ns 41959886
	// BM_MatrixVectorMultiplyDynamic/4/3 19 ns 19 ns 36674176
	// BM_MatrixVectorMultiplyDynamic/4/4 15 ns 15 ns 43561867
	// BM_MatrixVectorMultiplyDynamic/4/6 21 ns 21 ns 34278607
	// BM_MatrixVectorMultiplyDynamic/4/8 22 ns 22 ns 31484163
	// BM_MatrixVectorMultiplyDynamic/4/10 26 ns 26 ns 25605197
	// BM_MatrixVectorMultiplyDynamic/4/12 31 ns 31 ns 23380172
	// BM_MatrixVectorMultiplyDynamic/4/16 38 ns 38 ns 18054638
	// BM_MatrixVectorMultiplyDynamic/4/20 49 ns 49 ns 14771703
	void BM_MatrixVectorMultiplyDynamic(benchmark::State& state) {
	const int rows = state.range(0);
	const int cols = state.range(1);
	MatrixVectorMultiplyData data(rows, cols);
	const std::vector<double*> ptrs = data.ptrs();
	const int num_elements = data.num_elements();

	int i = 0;
	for (auto _ : state) {
	double* a_ptr = ptrs[3 * i];
	double* b_ptr = ptrs[3 * i + 1];
	double* c_ptr = ptrs[3 * i + 2];
	internal::MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
	a_ptr, rows, cols, b_ptr, c_ptr);
	i = (i + 1) % num_elements;
	}
	}

	// Each ArgPair specifies a row and column size of the matrix.
	BENCHMARK(BM_MatrixVectorMultiplyDynamic)
	->ArgPair(1, 1)
	->ArgPair(1, 2)
	->ArgPair(1, 3)
	->ArgPair(1, 4)
	->ArgPair(1, 6)
	->ArgPair(1, 8)
	->ArgPair(1, 10)
	->ArgPair(1, 12)
	->ArgPair(1, 16)
	->ArgPair(1, 20)
	->ArgPair(2, 1)
	->ArgPair(2, 2)
	->ArgPair(2, 3)
	->ArgPair(2, 4)
	->ArgPair(2, 6)
	->ArgPair(2, 8)
	->ArgPair(2, 10)
	->ArgPair(2, 12)
	->ArgPair(2, 16)
	->ArgPair(2, 20)
	->ArgPair(3, 1)
	->ArgPair(3, 2)
	->ArgPair(3, 3)
	->ArgPair(3, 4)
	->ArgPair(3, 6)
	->ArgPair(3, 8)
	->ArgPair(3, 10)
	->ArgPair(3, 12)
	->ArgPair(3, 16)
	->ArgPair(3, 20)
	->ArgPair(4, 1)
	->ArgPair(4, 2)
	->ArgPair(4, 3)
	->ArgPair(4, 4)
	->ArgPair(4, 6)
	->ArgPair(4, 8)
	->ArgPair(4, 10)
	->ArgPair(4, 12)
	->ArgPair(4, 16)
	->ArgPair(4, 20);

	// Run on (8 X 2200 MHz CPU s)
	// 2018-02-06 21:18:17
	// ------------------------------------------------------------------------------------
	// Benchmark Time CPU Iterations
	// ------------------------------------------------------------------------------------
	// BM_MatrixTransposeVectorMultiplyDynamic/1/1 5 ns 5 ns 139356174
	// BM_MatrixTransposeVectorMultiplyDynamic/1/2 6 ns 6 ns 120800041
	// BM_MatrixTransposeVectorMultiplyDynamic/1/3 7 ns 7 ns 100267858
	// BM_MatrixTransposeVectorMultiplyDynamic/1/4 9 ns 9 ns 70778564
	// BM_MatrixTransposeVectorMultiplyDynamic/1/6 14 ns 14 ns 47748651
	// BM_MatrixTransposeVectorMultiplyDynamic/1/8 16 ns 16 ns 43903663
	// BM_MatrixTransposeVectorMultiplyDynamic/1/10 18 ns 18 ns 34838177
	// BM_MatrixTransposeVectorMultiplyDynamic/1/12 20 ns 20 ns 36138731
	// BM_MatrixTransposeVectorMultiplyDynamic/1/16 23 ns 23 ns 27063704
	// BM_MatrixTransposeVectorMultiplyDynamic/1/20 29 ns 29 ns 23400336
	// BM_MatrixTransposeVectorMultiplyDynamic/2/1 6 ns 6 ns 121572101
	// BM_MatrixTransposeVectorMultiplyDynamic/2/2 8 ns 8 ns 82896155
	// BM_MatrixTransposeVectorMultiplyDynamic/2/3 12 ns 12 ns 56705415
	// BM_MatrixTransposeVectorMultiplyDynamic/2/4 14 ns 14 ns 51241509
	// BM_MatrixTransposeVectorMultiplyDynamic/2/6 18 ns 18 ns 38377403
	// BM_MatrixTransposeVectorMultiplyDynamic/2/8 25 ns 25 ns 28560121
	// BM_MatrixTransposeVectorMultiplyDynamic/2/10 29 ns 29 ns 23608052
	// BM_MatrixTransposeVectorMultiplyDynamic/2/12 33 ns 33 ns 20668478
	// BM_MatrixTransposeVectorMultiplyDynamic/2/16 44 ns 44 ns 16335446
	// BM_MatrixTransposeVectorMultiplyDynamic/2/20 53 ns 53 ns 13462315
	// BM_MatrixTransposeVectorMultiplyDynamic/3/1 6 ns 6 ns 117031415
	// BM_MatrixTransposeVectorMultiplyDynamic/3/2 10 ns 10 ns 71040747
	// BM_MatrixTransposeVectorMultiplyDynamic/3/3 14 ns 14 ns 49453538
	// BM_MatrixTransposeVectorMultiplyDynamic/3/4 17 ns 17 ns 39161935
	// BM_MatrixTransposeVectorMultiplyDynamic/3/6 22 ns 22 ns 32118490
	// BM_MatrixTransposeVectorMultiplyDynamic/3/8 28 ns 28 ns 25295689
	// BM_MatrixTransposeVectorMultiplyDynamic/3/10 34 ns 34 ns 20900389
	// BM_MatrixTransposeVectorMultiplyDynamic/3/12 39 ns 39 ns 17934922
	// BM_MatrixTransposeVectorMultiplyDynamic/3/16 51 ns 51 ns 10000000
	// BM_MatrixTransposeVectorMultiplyDynamic/3/20 64 ns 64 ns 10594824
	// BM_MatrixTransposeVectorMultiplyDynamic/4/1 7 ns 7 ns 98903583
	// BM_MatrixTransposeVectorMultiplyDynamic/4/2 13 ns 13 ns 57301899
	// BM_MatrixTransposeVectorMultiplyDynamic/4/3 16 ns 16 ns 44622083
	// BM_MatrixTransposeVectorMultiplyDynamic/4/4 18 ns 18 ns 39645007
	// BM_MatrixTransposeVectorMultiplyDynamic/4/6 26 ns 26 ns 27239262
	// BM_MatrixTransposeVectorMultiplyDynamic/4/8 33 ns 33 ns 20869171
	// BM_MatrixTransposeVectorMultiplyDynamic/4/10 39 ns 39 ns 17169614
	// BM_MatrixTransposeVectorMultiplyDynamic/4/12 47 ns 47 ns 15045286
	// BM_MatrixTransposeVectorMultiplyDynamic/4/16 62 ns 62 ns 11437535
	// BM_MatrixTransposeVectorMultiplyDynamic/4/20 77 ns 77 ns 8351428
	void BM_MatrixTransposeVectorMultiplyDynamic(benchmark::State& state) {
	const int rows = state.range(0);
	const int cols = state.range(1);
	MatrixVectorMultiplyData data(rows, cols);
	const std::vector<double*> ptrs = data.ptrs();
	const int num_elements = data.num_elements();

	int i = 0;
	for (auto _ : state) {
	double* a_ptr = ptrs[3 * i];
	double* b_ptr = ptrs[3 * i + 1];
	double* c_ptr = ptrs[3 * i + 2];
	internal::MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
	a_ptr, rows, cols, c_ptr, b_ptr);
	i = (i + 1) % num_elements;
	}
	}

	// Each ArgPair specifies a row and column size of the matrix.
	BENCHMARK(BM_MatrixTransposeVectorMultiplyDynamic)
	->ArgPair(1, 1)
	->ArgPair(1, 2)
	->ArgPair(1, 3)
	->ArgPair(1, 4)
	->ArgPair(1, 6)
	->ArgPair(1, 8)
	->ArgPair(1, 10)
	->ArgPair(1, 12)
	->ArgPair(1, 16)
	->ArgPair(1, 20)
	->ArgPair(2, 1)
	->ArgPair(2, 2)
	->ArgPair(2, 3)
	->ArgPair(2, 4)
	->ArgPair(2, 6)
	->ArgPair(2, 8)
	->ArgPair(2, 10)
	->ArgPair(2, 12)
	->ArgPair(2, 16)
	->ArgPair(2, 20)
	->ArgPair(3, 1)
	->ArgPair(3, 2)
	->ArgPair(3, 3)
	->ArgPair(3, 4)
	->ArgPair(3, 6)
	->ArgPair(3, 8)
	->ArgPair(3, 10)
	->ArgPair(3, 12)
	->ArgPair(3, 16)
	->ArgPair(3, 20)
	->ArgPair(4, 1)
	->ArgPair(4, 2)
	->ArgPair(4, 3)
	->ArgPair(4, 4)
	->ArgPair(4, 6)
	->ArgPair(4, 8)
	->ArgPair(4, 10)
	->ArgPair(4, 12)
	->ArgPair(4, 16)
	->ArgPair(4, 20);

	} // namespace ceres

	BENCHMARK_MAIN();