Refactor small_blas_benchmark.cc

1. Rename it to small_blas_gemv_benchmark.cc to better reflect
   its coverage.
2. Simplify the data generation.
3. Make the two vectors and the matrix in each element live in
   separate arrays to ensure that they are not adjacent in memory.
4. Use "Apply" instead of ArgPair to simplify and unify the
   matrix sizes.
5. Update the benchmark numbers and move them to a json
   file in the benchmarks directory.

Change-Id: Iaf3764083f902258def739c4be42e0580be103cb
diff --git a/benchmarks/macbook-pro-2014-small_blas_gemv_benchmark.json b/benchmarks/macbook-pro-2014-small_blas_gemv_benchmark.json
new file mode 100644
index 0000000..3e483a2
--- /dev/null
+++ b/benchmarks/macbook-pro-2014-small_blas_gemv_benchmark.json
@@ -0,0 +1,515 @@
+{
+  "context": {
+    "date": "2018-02-25 13:20:04",
+    "num_cpus": 8,
+    "mhz_per_cpu": 2200,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_MatrixVectorMultiply/1/1",
+      "iterations": 69298697,
+      "real_time": 1.0097105894512250e+01,
+      "cpu_time": 1.0040275360444367e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/1/2",
+      "iterations": 65913992,
+      "real_time": 1.0302522610563768e+01,
+      "cpu_time": 1.0245351244998181e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/1/3",
+      "iterations": 73595895,
+      "real_time": 9.6440729347083387e+00,
+      "cpu_time": 9.5853172245544389e+00,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/1/4",
+      "iterations": 69574897,
+      "real_time": 1.0073530211782117e+01,
+      "cpu_time": 9.9997560901886722e+00,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/1/6",
+      "iterations": 58273603,
+      "real_time": 1.1879482448448192e+01,
+      "cpu_time": 1.1746073089045133e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/1/7",
+      "iterations": 54426846,
+      "real_time": 1.2970374490544540e+01,
+      "cpu_time": 1.2881400476522181e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/1/12",
+      "iterations": 40754541,
+      "real_time": 1.7086407941937370e+01,
+      "cpu_time": 1.6969201051730625e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/1/16",
+      "iterations": 36813042,
+      "real_time": 2.0977509004292560e+01,
+      "cpu_time": 2.0338009556504460e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/1/20",
+      "iterations": 32061264,
+      "real_time": 2.1783493596539344e+01,
+      "cpu_time": 2.1625753744456222e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/2/1",
+      "iterations": 71987577,
+      "real_time": 9.9393425620120528e+00,
+      "cpu_time": 9.8528944792793975e+00,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/2/2",
+      "iterations": 63771443,
+      "real_time": 1.1381383373240160e+01,
+      "cpu_time": 1.1294600939169571e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/2/3",
+      "iterations": 52469043,
+      "real_time": 1.5624869525055580e+01,
+      "cpu_time": 1.5158995752981452e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/2/4",
+      "iterations": 45512470,
+      "real_time": 1.5249180278513519e+01,
+      "cpu_time": 1.4811259419671170e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/2/6",
+      "iterations": 40479275,
+      "real_time": 1.7419527499445600e+01,
+      "cpu_time": 1.7267799386229168e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/2/7",
+      "iterations": 35497677,
+      "real_time": 1.9656466957520109e+01,
+      "cpu_time": 1.9009525609239173e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/2/12",
+      "iterations": 27042793,
+      "real_time": 2.4197042293219681e+01,
+      "cpu_time": 2.3855672008434897e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/2/16",
+      "iterations": 24077820,
+      "real_time": 2.8851556123593411e+01,
+      "cpu_time": 2.8603793865059156e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/2/20",
+      "iterations": 20977240,
+      "real_time": 3.2981163630488950e+01,
+      "cpu_time": 3.2740865814568508e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/3/1",
+      "iterations": 61687596,
+      "real_time": 1.1764908379779497e+01,
+      "cpu_time": 1.1687698123298562e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/3/2",
+      "iterations": 43755469,
+      "real_time": 1.6423369476207299e+01,
+      "cpu_time": 1.6309206970218952e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/3/3",
+      "iterations": 37693381,
+      "real_time": 1.8031485450223030e+01,
+      "cpu_time": 1.7915983710774054e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/3/4",
+      "iterations": 36812849,
+      "real_time": 1.9044860858008274e+01,
+      "cpu_time": 1.8915433575923469e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/3/6",
+      "iterations": 31380003,
+      "real_time": 2.2716422938798129e+01,
+      "cpu_time": 2.2573452271499175e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/3/7",
+      "iterations": 29979614,
+      "real_time": 2.5305456738143899e+01,
+      "cpu_time": 2.4368525892294780e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/3/12",
+      "iterations": 22574601,
+      "real_time": 3.2935694633936649e+01,
+      "cpu_time": 3.1806276443158403e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/3/16",
+      "iterations": 18534847,
+      "real_time": 3.7167092181413921e+01,
+      "cpu_time": 3.6952179858835549e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/3/20",
+      "iterations": 14379061,
+      "real_time": 4.9144608746192276e+01,
+      "cpu_time": 4.8366649254773812e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/4/1",
+      "iterations": 48585469,
+      "real_time": 1.4383808851738836e+01,
+      "cpu_time": 1.4028535980582999e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/4/2",
+      "iterations": 36885397,
+      "real_time": 1.9225010159014570e+01,
+      "cpu_time": 1.9118216349955460e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/4/3",
+      "iterations": 31222959,
+      "real_time": 2.2854289371385235e+01,
+      "cpu_time": 2.2695991113462323e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/4/4",
+      "iterations": 31518186,
+      "real_time": 2.2779098836662452e+01,
+      "cpu_time": 2.2609264378349625e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/4/6",
+      "iterations": 27870791,
+      "real_time": 2.6312030431589900e+01,
+      "cpu_time": 2.6164704116219813e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/4/7",
+      "iterations": 26635110,
+      "real_time": 2.8015773282089135e+01,
+      "cpu_time": 2.7610248277555474e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/4/12",
+      "iterations": 17797575,
+      "real_time": 3.9333237811242320e+01,
+      "cpu_time": 3.8665042849938892e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/4/16",
+      "iterations": 14592636,
+      "real_time": 4.8254791185539325e+01,
+      "cpu_time": 4.7936644208764051e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixVectorMultiply/4/20",
+      "iterations": 11582884,
+      "real_time": 6.1053182527591900e+01,
+      "cpu_time": 6.0339031280982766e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/1/1",
+      "iterations": 76783011,
+      "real_time": 9.3956197029191628e+00,
+      "cpu_time": 9.3391232078669315e+00,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/1/2",
+      "iterations": 70815082,
+      "real_time": 9.6104494527203634e+00,
+      "cpu_time": 9.5503808072974952e+00,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/1/3",
+      "iterations": 66582329,
+      "real_time": 1.0727912012247707e+01,
+      "cpu_time": 1.0640345728969692e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/1/4",
+      "iterations": 53793602,
+      "real_time": 1.3614292048217301e+01,
+      "cpu_time": 1.3500583210620450e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/1/6",
+      "iterations": 40321652,
+      "real_time": 1.7766203749914158e+01,
+      "cpu_time": 1.7570882264446855e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/1/7",
+      "iterations": 37201986,
+      "real_time": 1.9978757341790001e+01,
+      "cpu_time": 1.8992776353391505e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/1/12",
+      "iterations": 27434090,
+      "real_time": 2.6018652924849363e+01,
+      "cpu_time": 2.5589622254647360e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/1/16",
+      "iterations": 18985007,
+      "real_time": 3.8675490929263397e+01,
+      "cpu_time": 3.8356793863705086e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/1/20",
+      "iterations": 16250235,
+      "real_time": 4.2684055338167965e+01,
+      "cpu_time": 4.2342464585896622e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/2/1",
+      "iterations": 72217809,
+      "real_time": 9.8704761588567429e+00,
+      "cpu_time": 9.7949108370208933e+00,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/2/2",
+      "iterations": 58405854,
+      "real_time": 1.2449916972022011e+01,
+      "cpu_time": 1.2345851496324279e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/2/3",
+      "iterations": 44559025,
+      "real_time": 1.6004563317294398e+01,
+      "cpu_time": 1.5868367855894519e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/2/4",
+      "iterations": 35586284,
+      "real_time": 1.8873825462252668e+01,
+      "cpu_time": 1.8742417724761761e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/2/6",
+      "iterations": 30603233,
+      "real_time": 2.3746124174457755e+01,
+      "cpu_time": 2.3614759917685756e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/2/7",
+      "iterations": 25503512,
+      "real_time": 2.7041517616016900e+01,
+      "cpu_time": 2.6823050880208051e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/2/12",
+      "iterations": 17142745,
+      "real_time": 4.0841419326141207e+01,
+      "cpu_time": 4.0593790551046581e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/2/16",
+      "iterations": 13289037,
+      "real_time": 5.3266691859451711e+01,
+      "cpu_time": 5.2837914440301496e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/2/20",
+      "iterations": 10179301,
+      "real_time": 7.1242744951979475e+01,
+      "cpu_time": 6.9943702421217523e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/3/1",
+      "iterations": 69897251,
+      "real_time": 1.0455159873353184e+01,
+      "cpu_time": 1.0183404780826111e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/3/2",
+      "iterations": 48687523,
+      "real_time": 1.4473279406174216e+01,
+      "cpu_time": 1.4368362917127623e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/3/3",
+      "iterations": 36199282,
+      "real_time": 1.8768082416320095e+01,
+      "cpu_time": 1.8594457204980039e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/3/4",
+      "iterations": 32392711,
+      "real_time": 2.1793111421538484e+01,
+      "cpu_time": 2.1554787433506387e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/3/6",
+      "iterations": 25449918,
+      "real_time": 2.7307808579096385e+01,
+      "cpu_time": 2.7092071573668875e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/3/7",
+      "iterations": 22970703,
+      "real_time": 3.0684889528257191e+01,
+      "cpu_time": 3.0455576392242047e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/3/12",
+      "iterations": 15189357,
+      "real_time": 4.8938380408733977e+01,
+      "cpu_time": 4.7137676729830986e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/3/16",
+      "iterations": 12187690,
+      "real_time": 6.4202213625161093e+01,
+      "cpu_time": 6.1960141749584857e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/3/20",
+      "iterations": 8626958,
+      "real_time": 7.7660120865570065e+01,
+      "cpu_time": 7.6987740058547374e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/4/1",
+      "iterations": 68685362,
+      "real_time": 1.0764260905433320e+01,
+      "cpu_time": 1.0544197175520464e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/4/2",
+      "iterations": 41396113,
+      "real_time": 1.6881927031100783e+01,
+      "cpu_time": 1.6629145833088174e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/4/3",
+      "iterations": 35592798,
+      "real_time": 2.0520861720163161e+01,
+      "cpu_time": 2.0318998242284707e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/4/4",
+      "iterations": 29245508,
+      "real_time": 2.4356611964112989e+01,
+      "cpu_time": 2.4129585986333215e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/4/6",
+      "iterations": 20036065,
+      "real_time": 3.4106466514085753e+01,
+      "cpu_time": 3.3619725230478345e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/4/7",
+      "iterations": 18768417,
+      "real_time": 3.6661212077921938e+01,
+      "cpu_time": 3.6314090847405801e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/4/12",
+      "iterations": 11789871,
+      "real_time": 6.1986798330774114e+01,
+      "cpu_time": 6.1171322400389727e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/4/16",
+      "iterations": 7869236,
+      "real_time": 8.5527959635890710e+01,
+      "cpu_time": 8.2749964545479799e+01,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_MatrixTransposeVectorMultiply/4/20",
+      "iterations": 7417847,
+      "real_time": 9.9073482499474821e+01,
+      "cpu_time": 9.7701125407411183e+01,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 4080ea7..618bd72 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -383,7 +383,6 @@
   add_executable(autodiff_cost_function_benchmark autodiff_cost_function_benchmark.cc)
   target_link_libraries(autodiff_cost_function_benchmark ceres benchmark::benchmark)
 
-  add_executable(small_blas_benchmark small_blas_benchmark.cc)
-  target_link_libraries(small_blas_benchmark ceres benchmark::benchmark)
-
+  add_executable(small_blas_gemv_benchmark small_blas_gemv_benchmark.cc)
+  target_link_libraries(small_blas_gemv_benchmark ceres benchmark::benchmark)
 endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/small_blas_benchmark.cc b/internal/ceres/small_blas_benchmark.cc
deleted file mode 100644
index c78a69d..0000000
--- a/internal/ceres/small_blas_benchmark.cc
+++ /dev/null
@@ -1,296 +0,0 @@
-// Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2018 Google Inc. All rights reserved.
-// http://ceres-solver.org/
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-// * Neither the name of Google Inc. nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: sameeragarwal@google.com (Sameer Agarwal)
-
-#include <iostream>
-#include "Eigen/Dense"
-#include "benchmark/benchmark.h"
-#include "ceres/small_blas.h"
-
-namespace ceres {
-
-// Benchmarking matrix-vector multiply routines and optimizing memory
-// access requires that we make sure that they are not just sitting in
-// the cache. So, as the benchmarking routine iterates, we need to
-// multiply new/different matrice and vectors. Allocating/creating
-// these objects in the benchmarking loop is too heavy duty, so we
-// create them before hand and cycle through them in the
-// benchmark. This class, given the size of the matrix creates such
-// matrix and vector objects for use in the benchmark.
-class MatrixVectorMultiplyData {
- public:
-  MatrixVectorMultiplyData(int rows, int cols) {
-    num_elements_ = 1000;
-    // A single memory buffer for all the matrices & vectors.
-
-    size_t buffer_size = num_elements_ * (200);
-    data_.resize(buffer_size, 1.00000000000001);
-
-    // Each element is three points, corresponding to the three
-    // elements of the expression c = A * b.
-    ptrs_.resize(3 * num_elements_, NULL);
-    double* p = &data_[0];
-    for (int i = 0; i < num_elements_; ++i) {
-      // Matrix X.
-      ptrs_[3 * i] = p;
-      p += rows * cols;
-      // Vector b.
-      ptrs_[3 * i + 1] = p;
-      p += cols;
-      // Vector c.
-      ptrs_[3 * i + 2] = p;
-      p += rows;
-    }
-  }
-
-  int num_elements() const { return num_elements_; }
-  double* data() { return &data_[0]; }
-  const std::vector<double*>& ptrs() const { return ptrs_; }
-
- private:
-  int num_elements_;
-  std::vector<double> data_;
-  std::vector<double*> ptrs_;
-};
-
-// Run on (8 X 2200 MHz CPU s)
-// 2018-02-06 21:23:59
-// ---------------------------------------------------------------------------
-// Benchmark                                    Time           CPU Iterations
-// ---------------------------------------------------------------------------
-// BM_MatrixVectorMultiplyDynamic/1/1           4 ns          4 ns  165611093
-// BM_MatrixVectorMultiplyDynamic/1/2           5 ns          5 ns  140648672
-// BM_MatrixVectorMultiplyDynamic/1/3           5 ns          5 ns  139414459
-// BM_MatrixVectorMultiplyDynamic/1/4           5 ns          5 ns  144247512
-// BM_MatrixVectorMultiplyDynamic/1/6           6 ns          6 ns  106639042
-// BM_MatrixVectorMultiplyDynamic/1/8           7 ns          7 ns  102367617
-// BM_MatrixVectorMultiplyDynamic/1/10          9 ns          9 ns   82419847
-// BM_MatrixVectorMultiplyDynamic/1/12         10 ns         10 ns   65129002
-// BM_MatrixVectorMultiplyDynamic/1/16         12 ns         12 ns   53500867
-// BM_MatrixVectorMultiplyDynamic/1/20         16 ns         16 ns   46067179
-// BM_MatrixVectorMultiplyDynamic/2/1           5 ns          5 ns  128880215
-// BM_MatrixVectorMultiplyDynamic/2/2           8 ns          8 ns   81938429
-// BM_MatrixVectorMultiplyDynamic/2/3          10 ns         10 ns   68807565
-// BM_MatrixVectorMultiplyDynamic/2/4           8 ns          8 ns   91833388
-// BM_MatrixVectorMultiplyDynamic/2/6          10 ns         10 ns   64031028
-// BM_MatrixVectorMultiplyDynamic/2/8          12 ns         12 ns   59788179
-// BM_MatrixVectorMultiplyDynamic/2/10         15 ns         15 ns   44737868
-// BM_MatrixVectorMultiplyDynamic/2/12         17 ns         17 ns   37423949
-// BM_MatrixVectorMultiplyDynamic/2/16         22 ns         22 ns   33470723
-// BM_MatrixVectorMultiplyDynamic/2/20         26 ns         26 ns   27076057
-// BM_MatrixVectorMultiplyDynamic/3/1           6 ns          6 ns  100932908
-// BM_MatrixVectorMultiplyDynamic/3/2          12 ns         12 ns   65591589
-// BM_MatrixVectorMultiplyDynamic/3/3          14 ns         14 ns   48182819
-// BM_MatrixVectorMultiplyDynamic/3/4          11 ns         11 ns   61770338
-// BM_MatrixVectorMultiplyDynamic/3/6          15 ns         15 ns   44712435
-// BM_MatrixVectorMultiplyDynamic/3/8          18 ns         18 ns   35177294
-// BM_MatrixVectorMultiplyDynamic/3/10         21 ns         21 ns   32164683
-// BM_MatrixVectorMultiplyDynamic/3/12         24 ns         24 ns   28222279
-// BM_MatrixVectorMultiplyDynamic/3/16         30 ns         30 ns   23050731
-// BM_MatrixVectorMultiplyDynamic/3/20         38 ns         38 ns   17832714
-// BM_MatrixVectorMultiplyDynamic/4/1           8 ns          8 ns   85763293
-// BM_MatrixVectorMultiplyDynamic/4/2          16 ns         16 ns   41959886
-// BM_MatrixVectorMultiplyDynamic/4/3          19 ns         19 ns   36674176
-// BM_MatrixVectorMultiplyDynamic/4/4          15 ns         15 ns   43561867
-// BM_MatrixVectorMultiplyDynamic/4/6          21 ns         21 ns   34278607
-// BM_MatrixVectorMultiplyDynamic/4/8          22 ns         22 ns   31484163
-// BM_MatrixVectorMultiplyDynamic/4/10         26 ns         26 ns   25605197
-// BM_MatrixVectorMultiplyDynamic/4/12         31 ns         31 ns   23380172
-// BM_MatrixVectorMultiplyDynamic/4/16         38 ns         38 ns   18054638
-// BM_MatrixVectorMultiplyDynamic/4/20         49 ns         49 ns   14771703
-void BM_MatrixVectorMultiplyDynamic(benchmark::State& state) {
-  const int rows = state.range(0);
-  const int cols = state.range(1);
-  MatrixVectorMultiplyData data(rows, cols);
-  const std::vector<double*> ptrs = data.ptrs();
-  const int num_elements = data.num_elements();
-
-  int i = 0;
-  for (auto _ : state) {
-    double* a_ptr = ptrs[3 * i];
-    double* b_ptr = ptrs[3 * i + 1];
-    double* c_ptr = ptrs[3 * i + 2];
-    internal::MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
-        a_ptr, rows, cols, b_ptr, c_ptr);
-    i = (i + 1) % num_elements;
-  }
-}
-
-// Each ArgPair specifies a row and column size of the matrix.
-BENCHMARK(BM_MatrixVectorMultiplyDynamic)
-->ArgPair(1, 1)
-->ArgPair(1, 2)
-->ArgPair(1, 3)
-->ArgPair(1, 4)
-->ArgPair(1, 6)
-->ArgPair(1, 8)
-->ArgPair(1, 10)
-->ArgPair(1, 12)
-->ArgPair(1, 16)
-->ArgPair(1, 20)
-->ArgPair(2, 1)
-->ArgPair(2, 2)
-->ArgPair(2, 3)
-->ArgPair(2, 4)
-->ArgPair(2, 6)
-->ArgPair(2, 8)
-->ArgPair(2, 10)
-->ArgPair(2, 12)
-->ArgPair(2, 16)
-->ArgPair(2, 20)
-->ArgPair(3, 1)
-->ArgPair(3, 2)
-->ArgPair(3, 3)
-->ArgPair(3, 4)
-->ArgPair(3, 6)
-->ArgPair(3, 8)
-->ArgPair(3, 10)
-->ArgPair(3, 12)
-->ArgPair(3, 16)
-->ArgPair(3, 20)
-->ArgPair(4, 1)
-->ArgPair(4, 2)
-->ArgPair(4, 3)
-->ArgPair(4, 4)
-->ArgPair(4, 6)
-->ArgPair(4, 8)
-->ArgPair(4, 10)
-->ArgPair(4, 12)
-->ArgPair(4, 16)
-->ArgPair(4, 20);
-
-// Run on (8 X 2200 MHz CPU s)
-// 2018-02-06 21:18:17
-// ------------------------------------------------------------------------------------
-// Benchmark                                             Time           CPU Iterations
-// ------------------------------------------------------------------------------------
-// BM_MatrixTransposeVectorMultiplyDynamic/1/1           5 ns          5 ns  139356174
-// BM_MatrixTransposeVectorMultiplyDynamic/1/2           6 ns          6 ns  120800041
-// BM_MatrixTransposeVectorMultiplyDynamic/1/3           7 ns          7 ns  100267858
-// BM_MatrixTransposeVectorMultiplyDynamic/1/4           9 ns          9 ns   70778564
-// BM_MatrixTransposeVectorMultiplyDynamic/1/6          14 ns         14 ns   47748651
-// BM_MatrixTransposeVectorMultiplyDynamic/1/8          16 ns         16 ns   43903663
-// BM_MatrixTransposeVectorMultiplyDynamic/1/10         18 ns         18 ns   34838177
-// BM_MatrixTransposeVectorMultiplyDynamic/1/12         20 ns         20 ns   36138731
-// BM_MatrixTransposeVectorMultiplyDynamic/1/16         23 ns         23 ns   27063704
-// BM_MatrixTransposeVectorMultiplyDynamic/1/20         29 ns         29 ns   23400336
-// BM_MatrixTransposeVectorMultiplyDynamic/2/1           6 ns          6 ns  121572101
-// BM_MatrixTransposeVectorMultiplyDynamic/2/2           8 ns          8 ns   82896155
-// BM_MatrixTransposeVectorMultiplyDynamic/2/3          12 ns         12 ns   56705415
-// BM_MatrixTransposeVectorMultiplyDynamic/2/4          14 ns         14 ns   51241509
-// BM_MatrixTransposeVectorMultiplyDynamic/2/6          18 ns         18 ns   38377403
-// BM_MatrixTransposeVectorMultiplyDynamic/2/8          25 ns         25 ns   28560121
-// BM_MatrixTransposeVectorMultiplyDynamic/2/10         29 ns         29 ns   23608052
-// BM_MatrixTransposeVectorMultiplyDynamic/2/12         33 ns         33 ns   20668478
-// BM_MatrixTransposeVectorMultiplyDynamic/2/16         44 ns         44 ns   16335446
-// BM_MatrixTransposeVectorMultiplyDynamic/2/20         53 ns         53 ns   13462315
-// BM_MatrixTransposeVectorMultiplyDynamic/3/1           6 ns          6 ns  117031415
-// BM_MatrixTransposeVectorMultiplyDynamic/3/2          10 ns         10 ns   71040747
-// BM_MatrixTransposeVectorMultiplyDynamic/3/3          14 ns         14 ns   49453538
-// BM_MatrixTransposeVectorMultiplyDynamic/3/4          17 ns         17 ns   39161935
-// BM_MatrixTransposeVectorMultiplyDynamic/3/6          22 ns         22 ns   32118490
-// BM_MatrixTransposeVectorMultiplyDynamic/3/8          28 ns         28 ns   25295689
-// BM_MatrixTransposeVectorMultiplyDynamic/3/10         34 ns         34 ns   20900389
-// BM_MatrixTransposeVectorMultiplyDynamic/3/12         39 ns         39 ns   17934922
-// BM_MatrixTransposeVectorMultiplyDynamic/3/16         51 ns         51 ns   10000000
-// BM_MatrixTransposeVectorMultiplyDynamic/3/20         64 ns         64 ns   10594824
-// BM_MatrixTransposeVectorMultiplyDynamic/4/1           7 ns          7 ns   98903583
-// BM_MatrixTransposeVectorMultiplyDynamic/4/2          13 ns         13 ns   57301899
-// BM_MatrixTransposeVectorMultiplyDynamic/4/3          16 ns         16 ns   44622083
-// BM_MatrixTransposeVectorMultiplyDynamic/4/4          18 ns         18 ns   39645007
-// BM_MatrixTransposeVectorMultiplyDynamic/4/6          26 ns         26 ns   27239262
-// BM_MatrixTransposeVectorMultiplyDynamic/4/8          33 ns         33 ns   20869171
-// BM_MatrixTransposeVectorMultiplyDynamic/4/10         39 ns         39 ns   17169614
-// BM_MatrixTransposeVectorMultiplyDynamic/4/12         47 ns         47 ns   15045286
-// BM_MatrixTransposeVectorMultiplyDynamic/4/16         62 ns         62 ns   11437535
-// BM_MatrixTransposeVectorMultiplyDynamic/4/20         77 ns         77 ns    8351428
-void BM_MatrixTransposeVectorMultiplyDynamic(benchmark::State& state) {
-  const int rows = state.range(0);
-  const int cols = state.range(1);
-  MatrixVectorMultiplyData data(rows, cols);
-  const std::vector<double*> ptrs = data.ptrs();
-  const int num_elements = data.num_elements();
-
-  int i = 0;
-  for (auto _ : state) {
-    double* a_ptr = ptrs[3 * i];
-    double* b_ptr = ptrs[3 * i + 1];
-    double* c_ptr = ptrs[3 * i + 2];
-    internal::MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
-        a_ptr, rows, cols, c_ptr, b_ptr);
-    i = (i + 1) % num_elements;
-  }
-}
-
-// Each ArgPair specifies a row and column size of the matrix.
-BENCHMARK(BM_MatrixTransposeVectorMultiplyDynamic)
-->ArgPair(1, 1)
-->ArgPair(1, 2)
-->ArgPair(1, 3)
-->ArgPair(1, 4)
-->ArgPair(1, 6)
-->ArgPair(1, 8)
-->ArgPair(1, 10)
-->ArgPair(1, 12)
-->ArgPair(1, 16)
-->ArgPair(1, 20)
-->ArgPair(2, 1)
-->ArgPair(2, 2)
-->ArgPair(2, 3)
-->ArgPair(2, 4)
-->ArgPair(2, 6)
-->ArgPair(2, 8)
-->ArgPair(2, 10)
-->ArgPair(2, 12)
-->ArgPair(2, 16)
-->ArgPair(2, 20)
-->ArgPair(3, 1)
-->ArgPair(3, 2)
-->ArgPair(3, 3)
-->ArgPair(3, 4)
-->ArgPair(3, 6)
-->ArgPair(3, 8)
-->ArgPair(3, 10)
-->ArgPair(3, 12)
-->ArgPair(3, 16)
-->ArgPair(3, 20)
-->ArgPair(4, 1)
-->ArgPair(4, 2)
-->ArgPair(4, 3)
-->ArgPair(4, 4)
-->ArgPair(4, 6)
-->ArgPair(4, 8)
-->ArgPair(4, 10)
-->ArgPair(4, 12)
-->ArgPair(4, 16)
-->ArgPair(4, 20);
-
-}  // namespace ceres
-
-BENCHMARK_MAIN();
diff --git a/internal/ceres/small_blas_gemv_benchmark.cc b/internal/ceres/small_blas_gemv_benchmark.cc
new file mode 100644
index 0000000..513ff12
--- /dev/null
+++ b/internal/ceres/small_blas_gemv_benchmark.cc
@@ -0,0 +1,116 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2018 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: sameeragarwal@google.com (Sameer Agarwal)
+
+#include "Eigen/Dense"
+#include "benchmark/benchmark.h"
+#include "ceres/small_blas.h"
+
+namespace ceres {
+
+// Benchmarking matrix-vector multiply routines and optimizing memory
+// access requires that the operands are not just sitting in the
+// cache. So, as the benchmarking routine iterates, we need to
+// multiply new/different matrices and vectors. Allocating/creating
+// these objects in the benchmarking loop is too heavy duty, so we
+// create them beforehand and cycle through them in the benchmark.
+// Given the size of the matrix, this class creates 1000 such
+// matrix/vector triples, storing each operand kind in its own array.
+class MatrixVectorMultiplyData {
+ public:
+  MatrixVectorMultiplyData(int rows, int cols) {
+    rows_ = rows;
+    cols_ = cols;
+
+    num_elements_ = 1000;  // Number of operand triples to cycle through.
+    a_.resize(num_elements_ * rows, 1.00001);         // rows values each.
+    b_.resize(num_elements_ * rows * cols, 1.00002);  // rows * cols each.
+    c_.resize(num_elements_ * cols, 1.00003);         // cols values each.
+  }
+
+  int num_elements() const { return num_elements_; }
+  double* GetA(int i) { return &a_[i * rows_]; };  // i-th rows-sized slice.
+  double* GetB(int i) { return &b_[i * rows_ * cols_]; };  // i-th matrix.
+  double* GetC(int i) { return &c_[i * cols_]; };  // i-th cols-sized slice.
+
+ private:
+  int num_elements_;
+  int rows_;
+  int cols_;
+  std::vector<double> a_;  // NOTE(review): relies on a transitive <vector>.
+  std::vector<double> b_;
+  std::vector<double> c_;
+};
+
+// Helper that registers every (rows, cols) combination from the two
+// lists below as an argument pair of the given benchmark.
+static void MatrixSizeArguments(benchmark::internal::Benchmark* benchmark) {
+  std::vector<int> rows = {1, 2, 3, 4};
+  std::vector<int> cols = {1, 2, 3, 4, 6, 7, 12, 16, 20};
+  for (int r : rows) {
+    for (int c : cols) {
+      benchmark->Args({r, c});  // Read back as state.range(0) / range(1).
+    }
+  }
+}
+
+void BM_MatrixVectorMultiply(benchmark::State& state) {
+  const int rows = state.range(0);  // First value of the argument pair.
+  const int cols = state.range(1);  // Second value of the argument pair.
+  MatrixVectorMultiplyData data(rows, cols);
+  const int num_elements = data.num_elements();
+  int i = 0;
+  for (auto _ : state) {
+    // A += B * C, where B is the rows x cols matrix of element i.
+    internal::MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+        data.GetB(i), rows, cols, data.GetC(i), data.GetA(i));
+    i = (i + 1) % num_elements;  // Wrap around to reuse the operand pool.
+  }
+}
+
+BENCHMARK(BM_MatrixVectorMultiply)->Apply(MatrixSizeArguments);
+
+void BM_MatrixTransposeVectorMultiply(benchmark::State& state) {
+  const int rows = state.range(0);  // First value of the argument pair.
+  const int cols = state.range(1);  // Second value of the argument pair.
+  MatrixVectorMultiplyData data(cols, rows);  // Swapped: A gets cols, C rows.
+  const int num_elements = data.num_elements();
+  int i = 0;
+  for (auto _ : state) {
+    internal::MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+        data.GetB(i), rows, cols, data.GetC(i), data.GetA(i));  // A += B' * C
+    i = (i + 1) % num_elements;  // Wrap around to reuse the operand pool.
+  }
+}
+
+BENCHMARK(BM_MatrixTransposeVectorMultiply)->Apply(MatrixSizeArguments);
+
+}  // namespace ceres
+
+BENCHMARK_MAIN();  // Google Benchmark macro that supplies main().