Refactor small_blas_gemm_benchmark
This allows benchmarking both dynamic and static problem sizes.
On my x86 2 GHz machine:
CPU Caches:
L1 Data 32 KiB (x48)
L1 Instruction 32 KiB (x48)
L2 Unified 1024 KiB (x48)
L3 Unified 39424 KiB (x2)
----------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------------------------------------
BM_MatrixMatrixMultiplyEigen_Static_2x3x4 10.9 ns 10.9 ns 63960496
BM_MatrixMatrixMultiplyEigen_Static_3x3x3 11.9 ns 11.9 ns 58093785
BM_MatrixMatrixMultiplyEigen_Static_4x4x4 24.9 ns 24.9 ns 28540133
BM_MatrixMatrixMultiplyEigen_Static_8x8x8 258 ns 258 ns 2755921
BM_MatrixMatrixMultiplyEigen_Static_9x9x3 255 ns 255 ns 2778628
BM_MatrixMatrixMultiplyEigen_Static_9x3x3 37.7 ns 37.7 ns 18556359
BM_MatrixMatrixMultiplyEigen_Static_3x9x9 220 ns 220 ns 3195163
BM_MatrixMatrixMultiplyEigen_Dynamic_2x3x4 37.8 ns 37.7 ns 18491162
BM_MatrixMatrixMultiplyEigen_Dynamic_3x3x3 48.8 ns 48.8 ns 13724884
BM_MatrixMatrixMultiplyEigen_Dynamic_4x4x4 74.5 ns 74.5 ns 9313146
BM_MatrixMatrixMultiplyEigen_Dynamic_8x8x8 271 ns 271 ns 2595807
BM_MatrixMatrixMultiplyEigen_Dynamic_9x9x3 259 ns 259 ns 2688515
BM_MatrixMatrixMultiplyEigen_Dynamic_9x3x3 123 ns 123 ns 5792115
BM_MatrixMatrixMultiplyEigen_Dynamic_3x9x9 236 ns 236 ns 2963896
BM_MatrixMatrixMultiplyNaive_Static_2x3x4 12.2 ns 12.2 ns 56472772
BM_MatrixMatrixMultiplyNaive_Static_3x3x3 15.5 ns 15.5 ns 44346456
BM_MatrixMatrixMultiplyNaive_Static_4x4x4 41.5 ns 41.5 ns 17196984
BM_MatrixMatrixMultiplyNaive_Static_8x8x8 199 ns 199 ns 3561730
BM_MatrixMatrixMultiplyNaive_Static_9x9x3 148 ns 148 ns 4764814
BM_MatrixMatrixMultiplyNaive_Static_9x3x3 38.4 ns 38.4 ns 17259019
BM_MatrixMatrixMultiplyNaive_Static_3x9x9 115 ns 115 ns 6104752
BM_MatrixMatrixMultiplyNaive_Dynamic_2x3x4 9.66 ns 9.66 ns 74722971
BM_MatrixMatrixMultiplyNaive_Dynamic_3x3x3 13.0 ns 13.0 ns 53435308
BM_MatrixMatrixMultiplyNaive_Dynamic_4x4x4 47.8 ns 47.8 ns 14358184
BM_MatrixMatrixMultiplyNaive_Dynamic_8x8x8 200 ns 200 ns 3572809
BM_MatrixMatrixMultiplyNaive_Dynamic_9x9x3 104 ns 104 ns 6793797
BM_MatrixMatrixMultiplyNaive_Dynamic_9x3x3 34.0 ns 34.0 ns 20790695
BM_MatrixMatrixMultiplyNaive_Dynamic_3x9x9 130 ns 130 ns 5170402
BM_MatrixTransposeMatrixMultiplyEigen_Static_2x3x4 10.3 ns 10.3 ns 69105234
BM_MatrixTransposeMatrixMultiplyEigen_Static_3x3x3 28.9 ns 28.9 ns 24478934
BM_MatrixTransposeMatrixMultiplyEigen_Static_4x4x4 23.7 ns 23.7 ns 29351926
BM_MatrixTransposeMatrixMultiplyEigen_Static_8x8x8 233 ns 233 ns 2929398
BM_MatrixTransposeMatrixMultiplyEigen_Static_9x9x3 211 ns 211 ns 3287409
BM_MatrixTransposeMatrixMultiplyEigen_Static_9x3x3 26.5 ns 26.5 ns 26515136
BM_MatrixTransposeMatrixMultiplyEigen_Static_3x9x9 196 ns 196 ns 3594314
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_2x3x4 9.05 ns 9.05 ns 77621001
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_3x3x3 11.1 ns 11.1 ns 62227812
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_4x4x4 25.5 ns 25.5 ns 27356089
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_8x8x8 248 ns 248 ns 2834983
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_9x9x3 229 ns 229 ns 3082369
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_9x3x3 28.4 ns 28.4 ns 24318629
BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_3x9x9 229 ns 229 ns 3091288
BM_MatrixTransposeMatrixMultiplyNaive_Static_2x3x4 11.0 ns 11.0 ns 63773538
BM_MatrixTransposeMatrixMultiplyNaive_Static_3x3x3 19.1 ns 19.1 ns 37003139
BM_MatrixTransposeMatrixMultiplyNaive_Static_4x4x4 49.1 ns 49.1 ns 14142301
BM_MatrixTransposeMatrixMultiplyNaive_Static_8x8x8 244 ns 244 ns 2874755
BM_MatrixTransposeMatrixMultiplyNaive_Static_9x9x3 140 ns 140 ns 4992156
BM_MatrixTransposeMatrixMultiplyNaive_Static_9x3x3 46.2 ns 46.2 ns 15068317
BM_MatrixTransposeMatrixMultiplyNaive_Static_3x9x9 112 ns 112 ns 6213574
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_2x3x4 9.74 ns 9.74 ns 72155001
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_3x3x3 11.5 ns 11.5 ns 60070577
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_4x4x4 52.5 ns 52.5 ns 13473642
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_8x8x8 224 ns 224 ns 3124264
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_9x9x3 98.0 ns 98.0 ns 7199292
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_9x3x3 34.7 ns 34.6 ns 20203685
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_3x9x9 105 ns 105 ns 6653151
Change-Id: Iee403b4d27801d1614ecc1f78a1f8c0011514bd7
diff --git a/internal/ceres/small_blas_gemm_benchmark.cc b/internal/ceres/small_blas_gemm_benchmark.cc
index aa6c41d..5248471 100644
--- a/internal/ceres/small_blas_gemm_benchmark.cc
+++ b/internal/ceres/small_blas_gemm_benchmark.cc
@@ -47,10 +47,10 @@
// benchmark.
class MatrixMatrixMultiplyData {
public:
- MatrixMatrixMultiplyData(
- int a_rows, int a_cols, int b_rows, int b_cols, int c_rows, int c_cols)
+ MatrixMatrixMultiplyData(int a_rows, int a_cols, int b_rows, int b_cols,
+ int c_rows, int c_cols)
: num_elements_(1000),
- a_size_(a_rows * a_cols),
+ a_size_(a_rows * a_cols),  // Elements per matrix; a_ scales it by num_elements_.
b_size_(b_rows * b_cols),
c_size_(c_rows * c_cols),
a_(num_elements_ * a_size_, 1.00001),
@@ -72,97 +72,130 @@
std::vector<double> c_;
};
-static void MatrixMatrixMultiplySizeArguments(
- benchmark::internal::Benchmark* benchmark) {
- const std::vector<int> b_rows = {1, 2, 3, 4, 6, 8};
- const std::vector<int> b_cols = {1, 2, 3, 4, 8, 12, 15};
- const std::vector<int> c_cols = b_cols;
- for (int i : b_rows) {
- for (int j : b_cols) {
- for (int k : c_cols) {
- benchmark->Args({i, j, k});
- }
- }
- }
-}
+// Operation selector passed as the last template argument of the GEMM
+// kernels benchmarked in this file. Every benchmark here uses GEMM_KIND_ADD,
+// the accumulating form (a += b * c).
+// NOTE(review): EQ and SUB presumably select a = b * c and a -= b * c;
+// confirm against the kernel definitions.
+#define GEMM_KIND_EQ 0
+#define GEMM_KIND_ADD 1
+#define GEMM_KIND_SUB -1
-void BM_MatrixMatrixMultiplyDynamic(benchmark::State& state) {
- const int i = state.range(0);
- const int j = state.range(1);
- const int k = state.range(2);
+// Defines and registers the benchmark BM_<FN>_<NAME>_<M>x<N>x<K>, which runs
+// a += b * c through FN with b of size M x N, c of size N x K and a of size
+// M x K. MT, NT and KT are the sizes handed to FN as template arguments (the
+// literal dimensions or Eigen::Dynamic). Each iteration uses the next of the
+// num_elements matrix triples held by MatrixMatrixMultiplyData.
+// The two arguments after the output pointer (presumably the start row and
+// column inside the output; confirm against FN) are 0, 0, matching
+// BENCHMARK_MTM_FN: the output is only a_rows x a_cols, far smaller than an
+// offset of 512.
+// NOTE(review): <MT, KT, KT, NT> does not mirror the runtime shapes
+// (b_rows x b_cols) * (c_rows x c_cols), which would read <MT, NT, NT, KT>;
+// confirm against FN's template parameter order.
+#define BENCHMARK_MM_FN(FN, M, N, K, NAME, MT, NT, KT) \
+ void static BM_##FN##_##NAME##_##M##x##N##x##K(benchmark::State& state) { \
+ const int b_rows = M; \
+ const int b_cols = N; \
+ const int c_rows = b_cols; \
+ const int c_cols = K; \
+ const int a_rows = b_rows; \
+ const int a_cols = c_cols; \
+ MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows, \
+ c_cols); \
+ const int num_elements = data.num_elements(); \
+ int iter = 0; \
+ for (auto _ : state) { \
+ FN<MT, KT, KT, NT, GEMM_KIND_ADD>( \
+ data.GetB(iter), b_rows, b_cols, data.GetC(iter), c_rows, c_cols, \
+ data.GetA(iter), 0, 0, a_rows, a_cols); \
+ iter = (iter + 1) % num_elements; \
+ } \
+ } \
+ BENCHMARK(BM_##FN##_##NAME##_##M##x##N##x##K);
- const int b_rows = i;
- const int b_cols = j;
- const int c_rows = b_cols;
- const int c_cols = k;
- const int a_rows = b_rows;
- const int a_cols = c_cols;
+// Expands BENCHMARK_MM_FN with the literal sizes M, N, K as the template
+// arguments; the generated benchmark is named ..._Static_<M>x<N>x<K>.
+#define BENCHMARK_STATIC_MM_FN(FN, M, N, K) \
+ BENCHMARK_MM_FN(FN, M, N, K, Static, M, N, K)
+// Expands BENCHMARK_MM_FN with Eigen::Dynamic for every template argument, so
+// M, N, K reach FN only as run-time arguments; the generated benchmark is
+// named ..._Dynamic_<M>x<N>x<K>.
+#define BENCHMARK_DYNAMIC_MM_FN(FN, M, N, K) \
+ BENCHMARK_MM_FN(FN, M, N, K, Dynamic, Eigen::Dynamic, Eigen::Dynamic, \
+ Eigen::Dynamic)
- MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows, c_cols);
- const int num_elements = data.num_elements();
+// Defines and registers the benchmark BM_<FN>_<NAME>_<M>x<N>x<K> for the
+// transposed product: b is M x N, c is M x K (c_rows == b_rows) and the
+// output a is N x K (a_rows == b_cols). MT, NT and KT are the sizes handed to
+// FN as template arguments (the literal dimensions or Eigen::Dynamic). Each
+// iteration uses the next of the num_elements matrix triples held by
+// MatrixMatrixMultiplyData, wrapping around at the end.
+// NOTE(review): <KT, MT, KT, NT> does not mirror the runtime shapes
+// (b_rows x b_cols) and (c_rows x c_cols), which would read <MT, NT, MT, KT>;
+// confirm against FN's template parameter order.
+#define BENCHMARK_MTM_FN(FN, M, N, K, NAME, MT, NT, KT) \
+ void static BM_##FN##_##NAME##_##M##x##N##x##K(benchmark::State& state) { \
+ const int b_rows = M; \
+ const int b_cols = N; \
+ const int c_rows = b_rows; \
+ const int c_cols = K; \
+ const int a_rows = b_cols; \
+ const int a_cols = c_cols; \
+ MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows, \
+ c_cols); \
+ const int num_elements = data.num_elements(); \
+ int iter = 0; \
+ for (auto _ : state) { \
+ FN<KT, MT, KT, NT, GEMM_KIND_ADD>( \
+ data.GetB(iter), b_rows, b_cols, data.GetC(iter), c_rows, c_cols, \
+ data.GetA(iter), 0, 0, a_rows, a_cols); \
+ iter = (iter + 1) % num_elements; \
+ } \
+ } \
+ BENCHMARK(BM_##FN##_##NAME##_##M##x##N##x##K);
- int iter = 0;
- for (auto _ : state) {
- // a += b * c
- // clang-format off
- MatrixMatrixMultiply
- <Eigen::Dynamic, Eigen::Dynamic,Eigen::Dynamic,Eigen::Dynamic, 1>
- (data.GetB(iter), b_rows, b_cols,
- data.GetC(iter), c_rows, c_cols,
- data.GetA(iter), 0, 0, a_rows, a_cols);
- // clang-format on
- iter = (iter + 1) % num_elements;
- }
-}
+// Expands BENCHMARK_MTM_FN with the literal sizes M, N, K as the template
+// arguments. Note the spelling: these wrappers are named ..._MMT_FN while the
+// macro they expand is BENCHMARK_MTM_FN.
+#define BENCHMARK_STATIC_MMT_FN(FN, M, N, K) \
+ BENCHMARK_MTM_FN(FN, M, N, K, Static, M, N, K)
+// Expands BENCHMARK_MTM_FN with Eigen::Dynamic for every template argument,
+// so M, N, K reach FN only as run-time arguments.
+#define BENCHMARK_DYNAMIC_MMT_FN(FN, M, N, K) \
+ BENCHMARK_MTM_FN(FN, M, N, K, Dynamic, Eigen::Dynamic, Eigen::Dynamic, \
+ Eigen::Dynamic)
-BENCHMARK(BM_MatrixMatrixMultiplyDynamic)
- ->Apply(MatrixMatrixMultiplySizeArguments);
+// Benchmark instantiations. Each kernel is registered for the same seven
+// (M, N, K) problem sizes, first with static and then with dynamic sizes.
+// MatrixMatrixMultiplyEigen.
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 2, 3, 4)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 3, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 4, 4, 4)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 8, 8, 8)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 9, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 3, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 9, 9)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 2, 3, 4)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 3, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 4, 4, 4)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 8, 8, 8)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 9, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 3, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 9, 9)
+// MatrixMatrixMultiplyNaive.
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 2, 3, 4)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 3, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 4, 4, 4)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 8, 8, 8)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 9, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 3, 3)
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 9, 9)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 2, 3, 4)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 3, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 4, 4, 4)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 8, 8, 8)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 9, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 3, 3)
+BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 9, 9)
+// MatrixTransposeMatrixMultiplyEigen.
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 2, 3, 4)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 3, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 4, 4, 4)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 8, 8, 8)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 9, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 3, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 9, 9)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 2, 3, 4)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 3, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 4, 4, 4)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 8, 8, 8)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 9, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 3, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 9, 9)
+// MatrixTransposeMatrixMultiplyNaive.
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 2, 3, 4)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 3, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 4, 4, 4)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 8, 8, 8)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 9, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 3, 3)
+BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 9, 9)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 2, 3, 4)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 3, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 4, 4, 4)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 8, 8, 8)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 9, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 3, 3)
+BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 9, 9)
-static void MatrixTransposeMatrixMultiplySizeArguments(
- benchmark::internal::Benchmark* benchmark) {
- std::vector<int> b_rows = {1, 2, 3, 4, 6, 8};
- std::vector<int> b_cols = {1, 2, 3, 4, 8, 12, 15};
- std::vector<int> c_cols = b_rows;
- for (int i : b_rows) {
- for (int j : b_cols) {
- for (int k : c_cols) {
- benchmark->Args({i, j, k});
- }
- }
- }
-}
-
-void BM_MatrixTransposeMatrixMultiplyDynamic(benchmark::State& state) {
- const int i = state.range(0);
- const int j = state.range(1);
- const int k = state.range(2);
-
- const int b_rows = i;
- const int b_cols = j;
- const int c_rows = b_rows;
- const int c_cols = k;
- const int a_rows = b_cols;
- const int a_cols = c_cols;
-
- MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows, c_cols);
- const int num_elements = data.num_elements();
-
- int iter = 0;
- for (auto _ : state) {
- // a += b' * c
- // clang-format off
- MatrixTransposeMatrixMultiply
- <Eigen::Dynamic,Eigen::Dynamic,Eigen::Dynamic,Eigen::Dynamic, 1>
- (data.GetB(iter), b_rows, b_cols,
- data.GetC(iter), c_rows, c_cols,
- data.GetA(iter), 0, 0, a_rows, a_cols);
- // clang-format on
- iter = (iter + 1) % num_elements;
- }
-}
-
-BENCHMARK(BM_MatrixTransposeMatrixMultiplyDynamic)
- ->Apply(MatrixTransposeMatrixMultiplySizeArguments);
+// Undefine the helper macros now that every benchmark has been declared, so
+// they are not visible to code that follows.
+#undef GEMM_KIND_EQ
+#undef GEMM_KIND_ADD
+#undef GEMM_KIND_SUB
+#undef BENCHMARK_MM_FN
+#undef BENCHMARK_STATIC_MM_FN
+#undef BENCHMARK_DYNAMIC_MM_FN
+#undef BENCHMARK_MTM_FN
+#undef BENCHMARK_DYNAMIC_MMT_FN
+#undef BENCHMARK_STATIC_MMT_FN
} // namespace internal
} // namespace ceres