Refactor small_blas_gemm_benchmark

This allows benchmarking both dynamic and static problem sizes.

On my x86 2 GHz machine:

CPU Caches:
  L1 Data 32 KiB (x48)
  L1 Instruction 32 KiB (x48)
  L2 Unified 1024 KiB (x48)
  L3 Unified 39424 KiB (x2)

----------------------------------------------------------------------------------------------
Benchmark                                                    Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_MatrixMatrixMultiplyEigen_Static_2x3x4                 10.9 ns         10.9 ns     63960496
BM_MatrixMatrixMultiplyEigen_Static_3x3x3                 11.9 ns         11.9 ns     58093785
BM_MatrixMatrixMultiplyEigen_Static_4x4x4                 24.9 ns         24.9 ns     28540133
BM_MatrixMatrixMultiplyEigen_Static_8x8x8                  258 ns          258 ns      2755921
BM_MatrixMatrixMultiplyEigen_Static_9x9x3                  255 ns          255 ns      2778628
BM_MatrixMatrixMultiplyEigen_Static_9x3x3                 37.7 ns         37.7 ns     18556359
BM_MatrixMatrixMultiplyEigen_Static_3x9x9                  220 ns          220 ns      3195163
BM_MatrixMatrixMultiplyEigen_Dynamic_2x3x4                37.8 ns         37.7 ns     18491162
BM_MatrixMatrixMultiplyEigen_Dynamic_3x3x3                48.8 ns         48.8 ns     13724884
BM_MatrixMatrixMultiplyEigen_Dynamic_4x4x4                74.5 ns         74.5 ns      9313146
BM_MatrixMatrixMultiplyEigen_Dynamic_8x8x8                 271 ns          271 ns      2595807
BM_MatrixMatrixMultiplyEigen_Dynamic_9x9x3                 259 ns          259 ns      2688515
BM_MatrixMatrixMultiplyEigen_Dynamic_9x3x3                 123 ns          123 ns      5792115
BM_MatrixMatrixMultiplyEigen_Dynamic_3x9x9                 236 ns          236 ns      2963896
BM_MatrixMatrixMultiplyNaive_Static_2x3x4                 12.2 ns         12.2 ns     56472772
BM_MatrixMatrixMultiplyNaive_Static_3x3x3                 15.5 ns         15.5 ns     44346456
BM_MatrixMatrixMultiplyNaive_Static_4x4x4                 41.5 ns         41.5 ns     17196984
BM_MatrixMatrixMultiplyNaive_Static_8x8x8                  199 ns          199 ns      3561730
BM_MatrixMatrixMultiplyNaive_Static_9x9x3                  148 ns          148 ns      4764814
BM_MatrixMatrixMultiplyNaive_Static_9x3x3                 38.4 ns         38.4 ns     17259019
BM_MatrixMatrixMultiplyNaive_Static_3x9x9                  115 ns          115 ns      6104752
BM_MatrixMatrixMultiplyNaive_Dynamic_2x3x4                9.66 ns         9.66 ns     74722971
BM_MatrixMatrixMultiplyNaive_Dynamic_3x3x3                13.0 ns         13.0 ns     53435308
BM_MatrixMatrixMultiplyNaive_Dynamic_4x4x4                47.8 ns         47.8 ns     14358184
BM_MatrixMatrixMultiplyNaive_Dynamic_8x8x8                 200 ns          200 ns      3572809
BM_MatrixMatrixMultiplyNaive_Dynamic_9x9x3                 104 ns          104 ns      6793797
BM_MatrixMatrixMultiplyNaive_Dynamic_9x3x3                34.0 ns         34.0 ns     20790695
BM_MatrixMatrixMultiplyNaive_Dynamic_3x9x9                 130 ns          130 ns      5170402
BM_MatrixTransposeMatrixMultiplyEigen_Static_2x3x4        10.3 ns         10.3 ns     69105234
BM_MatrixTransposeMatrixMultiplyEigen_Static_3x3x3        28.9 ns         28.9 ns     24478934
BM_MatrixTransposeMatrixMultiplyEigen_Static_4x4x4        23.7 ns         23.7 ns     29351926
BM_MatrixTransposeMatrixMultiplyEigen_Static_8x8x8         233 ns          233 ns      2929398
BM_MatrixTransposeMatrixMultiplyEigen_Static_9x9x3         211 ns          211 ns      3287409
BM_MatrixTransposeMatrixMultiplyEigen_Static_9x3x3        26.5 ns         26.5 ns     26515136
BM_MatrixTransposeMatrixMultiplyEigen_Static_3x9x9         196 ns          196 ns      3594314
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_2x3x4       9.05 ns         9.05 ns     77621001
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_3x3x3       11.1 ns         11.1 ns     62227812
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_4x4x4       25.5 ns         25.5 ns     27356089
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_8x8x8        248 ns          248 ns      2834983
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_9x9x3        229 ns          229 ns      3082369
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_9x3x3       28.4 ns         28.4 ns     24318629
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_3x9x9        229 ns          229 ns      3091288
BM_MatrixTransposeMatrixMultiplyNaive_Static_2x3x4        11.0 ns         11.0 ns     63773538
BM_MatrixTransposeMatrixMultiplyNaive_Static_3x3x3        19.1 ns         19.1 ns     37003139
BM_MatrixTransposeMatrixMultiplyNaive_Static_4x4x4        49.1 ns         49.1 ns     14142301
BM_MatrixTransposeMatrixMultiplyNaive_Static_8x8x8         244 ns          244 ns      2874755
BM_MatrixTransposeMatrixMultiplyNaive_Static_9x9x3         140 ns          140 ns      4992156
BM_MatrixTransposeMatrixMultiplyNaive_Static_9x3x3        46.2 ns         46.2 ns     15068317
BM_MatrixTransposeMatrixMultiplyNaive_Static_3x9x9         112 ns          112 ns      6213574
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_2x3x4       9.74 ns         9.74 ns     72155001
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_3x3x3       11.5 ns         11.5 ns     60070577
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_4x4x4       52.5 ns         52.5 ns     13473642
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_8x8x8        224 ns          224 ns      3124264
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_9x9x3       98.0 ns         98.0 ns      7199292
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_9x3x3       34.7 ns         34.6 ns     20203685
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_3x9x9        105 ns          105 ns      6653151

Change-Id: Iee403b4d27801d1614ecc1f78a1f8c0011514bd7
diff --git a/internal/ceres/small_blas_gemm_benchmark.cc b/internal/ceres/small_blas_gemm_benchmark.cc
index aa6c41d..5248471 100644
--- a/internal/ceres/small_blas_gemm_benchmark.cc
+++ b/internal/ceres/small_blas_gemm_benchmark.cc
@@ -47,10 +47,10 @@
 // benchmark.
 class MatrixMatrixMultiplyData {
  public:
-  MatrixMatrixMultiplyData(
-      int a_rows, int a_cols, int b_rows, int b_cols, int c_rows, int c_cols)
+  MatrixMatrixMultiplyData(int a_rows, int a_cols, int b_rows, int b_cols,
+                           int c_rows, int c_cols)
       : num_elements_(1000),
-        a_size_(a_rows * a_cols),
+        a_size_(num_elements_ * a_rows * a_cols),
         b_size_(b_rows * b_cols),
         c_size_(c_rows * c_cols),
         a_(num_elements_ * a_size_, 1.00001),
@@ -72,97 +72,130 @@
   std::vector<double> c_;
 };
 
-static void MatrixMatrixMultiplySizeArguments(
-    benchmark::internal::Benchmark* benchmark) {
-  const std::vector<int> b_rows = {1, 2, 3, 4, 6, 8};
-  const std::vector<int> b_cols = {1, 2, 3, 4, 8, 12, 15};
-  const std::vector<int> c_cols = b_cols;
-  for (int i : b_rows) {
-    for (int j : b_cols) {
-      for (int k : c_cols) {
-        benchmark->Args({i, j, k});
-      }
-    }
-  }
-}
+#define GEMM_KIND_EQ 0
+#define GEMM_KIND_ADD 1
+#define GEMM_KIND_SUB -1
 
-void BM_MatrixMatrixMultiplyDynamic(benchmark::State& state) {
-  const int i = state.range(0);
-  const int j = state.range(1);
-  const int k = state.range(2);
+#define BENCHMARK_MM_FN(FN, M, N, K, NAME, MT, NT, KT)                      \
+  void static BM_##FN##_##NAME##_##M##x##N##x##K(benchmark::State& state) { \
+    const int b_rows = M;                                                   \
+    const int b_cols = N;                                                   \
+    const int c_rows = b_cols;                                              \
+    const int c_cols = K;                                                   \
+    const int a_rows = b_rows;                                              \
+    const int a_cols = c_cols;                                              \
+    MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows,   \
+                                  c_cols);                                  \
+    const int num_elements = data.num_elements();                           \
+    int iter = 0;                                                           \
+    for (auto _ : state) {                                                  \
+      FN<MT, KT, KT, NT, GEMM_KIND_ADD>(                                    \
+          data.GetB(iter), b_rows, b_cols, data.GetC(iter), c_rows, c_cols, \
+          data.GetA(iter), 0, 0, a_rows, a_cols);                           \
+      iter = (iter + 1) % num_elements;                                     \
+    }                                                                       \
+  }                                                                         \
+  BENCHMARK(BM_##FN##_##NAME##_##M##x##N##x##K);
 
-  const int b_rows = i;
-  const int b_cols = j;
-  const int c_rows = b_cols;
-  const int c_cols = k;
-  const int a_rows = b_rows;
-  const int a_cols = c_cols;
+#define BENCHMARK_STATIC_MM_FN(FN, M, N, K) \
+  BENCHMARK_MM_FN(FN, M, N, K, Static, M, N, K)
+#define BENCHMARK_DYNAMIC_MM_FN(FN, M, N, K)                            \
+  BENCHMARK_MM_FN(FN, M, N, K, Dynamic, Eigen::Dynamic, Eigen::Dynamic, \
+                  Eigen::Dynamic)
 
-  MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows, c_cols);
-  const int num_elements = data.num_elements();
+#define BENCHMARK_MTM_FN(FN, M, N, K, NAME, MT, NT, KT)                     \
+  void static BM_##FN##_##NAME##_##M##x##N##x##K(benchmark::State& state) { \
+    const int b_rows = M;                                                   \
+    const int b_cols = N;                                                   \
+    const int c_rows = b_rows;                                              \
+    const int c_cols = K;                                                   \
+    const int a_rows = b_cols;                                              \
+    const int a_cols = c_cols;                                              \
+    MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows,   \
+                                  c_cols);                                  \
+    const int num_elements = data.num_elements();                           \
+    int iter = 0;                                                           \
+    for (auto _ : state) {                                                  \
+      FN<KT, MT, KT, NT, GEMM_KIND_ADD>(                                    \
+          data.GetB(iter), b_rows, b_cols, data.GetC(iter), c_rows, c_cols, \
+          data.GetA(iter), 0, 0, a_rows, a_cols);                           \
+      iter = (iter + 1) % num_elements;                                     \
+    }                                                                       \
+  }                                                                         \
+  BENCHMARK(BM_##FN##_##NAME##_##M##x##N##x##K);
 
-  int iter = 0;
-  for (auto _ : state) {
-    // a += b * c
-    // clang-format off
-    MatrixMatrixMultiply
-        <Eigen::Dynamic, Eigen::Dynamic,Eigen::Dynamic,Eigen::Dynamic, 1>
-        (data.GetB(iter), b_rows, b_cols,
-         data.GetC(iter), c_rows, c_cols,
-         data.GetA(iter), 0, 0, a_rows, a_cols);
-    // clang-format on
-    iter = (iter + 1) % num_elements;
-  }
-}
+#define BENCHMARK_STATIC_MMT_FN(FN, M, N, K) \
+  BENCHMARK_MTM_FN(FN, M, N, K, Static, M, N, K)
+#define BENCHMARK_DYNAMIC_MMT_FN(FN, M, N, K)                            \
+  BENCHMARK_MTM_FN(FN, M, N, K, Dynamic, Eigen::Dynamic, Eigen::Dynamic, \
+                   Eigen::Dynamic)
 
-BENCHMARK(BM_MatrixMatrixMultiplyDynamic)
-    ->Apply(MatrixMatrixMultiplySizeArguments);
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 2, 3, 4)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 3, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 4, 4, 4)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 8, 8, 8)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 9, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 3, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 9, 9)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 2, 3, 4)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 3, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 4, 4, 4)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 8, 8, 8)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 9, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 3, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 9, 9)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 2, 3, 4)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 3, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 4, 4, 4)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 8, 8, 8)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 9, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 3, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 9, 9)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 2, 3, 4)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 3, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 4, 4, 4)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 8, 8, 8)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 9, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 3, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 9, 9)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 2, 3, 4)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 3, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 4, 4, 4)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 8, 8, 8)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 9, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 3, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 9, 9)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 2, 3, 4)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 3, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 4, 4, 4)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 8, 8, 8)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 9, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 3, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 9, 9)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 2, 3, 4)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 3, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 4, 4, 4)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 8, 8, 8)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 9, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 3, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 9, 9)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 2, 3, 4)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 3, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 4, 4, 4)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 8, 8, 8)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 9, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 3, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 9, 9)
 
-static void MatrixTransposeMatrixMultiplySizeArguments(
-    benchmark::internal::Benchmark* benchmark) {
-  std::vector<int> b_rows = {1, 2, 3, 4, 6, 8};
-  std::vector<int> b_cols = {1, 2, 3, 4, 8, 12, 15};
-  std::vector<int> c_cols = b_rows;
-  for (int i : b_rows) {
-    for (int j : b_cols) {
-      for (int k : c_cols) {
-        benchmark->Args({i, j, k});
-      }
-    }
-  }
-}
-
-void BM_MatrixTransposeMatrixMultiplyDynamic(benchmark::State& state) {
-  const int i = state.range(0);
-  const int j = state.range(1);
-  const int k = state.range(2);
-
-  const int b_rows = i;
-  const int b_cols = j;
-  const int c_rows = b_rows;
-  const int c_cols = k;
-  const int a_rows = b_cols;
-  const int a_cols = c_cols;
-
-  MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows, c_cols);
-  const int num_elements = data.num_elements();
-
-  int iter = 0;
-  for (auto _ : state) {
-    // a += b' * c
-    // clang-format off
-    MatrixTransposeMatrixMultiply
-        <Eigen::Dynamic,Eigen::Dynamic,Eigen::Dynamic,Eigen::Dynamic, 1>
-        (data.GetB(iter), b_rows, b_cols,
-         data.GetC(iter), c_rows, c_cols,
-         data.GetA(iter), 0, 0, a_rows, a_cols);
-    // clang-format on
-    iter = (iter + 1) % num_elements;
-  }
-}
-
-BENCHMARK(BM_MatrixTransposeMatrixMultiplyDynamic)
-    ->Apply(MatrixTransposeMatrixMultiplySizeArguments);
+#undef GEMM_KIND_EQ
+#undef GEMM_KIND_ADD
+#undef GEMM_KIND_SUB
+#undef BENCHMARK_MM_FN
+#undef BENCHMARK_STATIC_MM_FN
+#undef BENCHMARK_DYNAMIC_MM_FN
+#undef BENCHMARK_MTM_FN
+#undef BENCHMARK_DYNAMIC_MMT_FN
+#undef BENCHMARK_STATIC_MMT_FN
 
 }  // namespace internal
 }  // namespace ceres