Refactor small_blas_gemm_benchmark. This allows benchmarking both dynamic and static problem sizes. On my x86 2 GHz machine: CPU Caches: L1 Data 32 KiB (x48) L1 Instruction 32 KiB (x48) L2 Unified 1024 KiB (x48) L3 Unified 39424 KiB (x2) ---------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------------------------------------------- BM_MatrixMatrixMultiplyEigen_Static_2x3x4 10.9 ns 10.9 ns 63960496 BM_MatrixMatrixMultiplyEigen_Static_3x3x3 11.9 ns 11.9 ns 58093785 BM_MatrixMatrixMultiplyEigen_Static_4x4x4 24.9 ns 24.9 ns 28540133 BM_MatrixMatrixMultiplyEigen_Static_8x8x8 258 ns 258 ns 2755921 BM_MatrixMatrixMultiplyEigen_Static_9x9x3 255 ns 255 ns 2778628 BM_MatrixMatrixMultiplyEigen_Static_9x3x3 37.7 ns 37.7 ns 18556359 BM_MatrixMatrixMultiplyEigen_Static_3x9x9 220 ns 220 ns 3195163 BM_MatrixMatrixMultiplyEigen_Dynamic_2x3x4 37.8 ns 37.7 ns 18491162 BM_MatrixMatrixMultiplyEigen_Dynamic_3x3x3 48.8 ns 48.8 ns 13724884 BM_MatrixMatrixMultiplyEigen_Dynamic_4x4x4 74.5 ns 74.5 ns 9313146 BM_MatrixMatrixMultiplyEigen_Dynamic_8x8x8 271 ns 271 ns 2595807 BM_MatrixMatrixMultiplyEigen_Dynamic_9x9x3 259 ns 259 ns 2688515 BM_MatrixMatrixMultiplyEigen_Dynamic_9x3x3 123 ns 123 ns 5792115 BM_MatrixMatrixMultiplyEigen_Dynamic_3x9x9 236 ns 236 ns 2963896 BM_MatrixMatrixMultiplyNaive_Static_2x3x4 12.2 ns 12.2 ns 56472772 BM_MatrixMatrixMultiplyNaive_Static_3x3x3 15.5 ns 15.5 ns 44346456 BM_MatrixMatrixMultiplyNaive_Static_4x4x4 41.5 ns 41.5 ns 17196984 BM_MatrixMatrixMultiplyNaive_Static_8x8x8 199 ns 199 ns 3561730 BM_MatrixMatrixMultiplyNaive_Static_9x9x3 148 ns 148 ns 4764814 BM_MatrixMatrixMultiplyNaive_Static_9x3x3 38.4 ns 38.4 ns 17259019 BM_MatrixMatrixMultiplyNaive_Static_3x9x9 115 ns 115 ns 6104752 BM_MatrixMatrixMultiplyNaive_Dynamic_2x3x4 9.66 ns 9.66 ns 74722971 BM_MatrixMatrixMultiplyNaive_Dynamic_3x3x3 13.0 ns 13.0 ns 53435308 
BM_MatrixMatrixMultiplyNaive_Dynamic_4x4x4 47.8 ns 47.8 ns 14358184 BM_MatrixMatrixMultiplyNaive_Dynamic_8x8x8 200 ns 200 ns 3572809 BM_MatrixMatrixMultiplyNaive_Dynamic_9x9x3 104 ns 104 ns 6793797 BM_MatrixMatrixMultiplyNaive_Dynamic_9x3x3 34.0 ns 34.0 ns 20790695 BM_MatrixMatrixMultiplyNaive_Dynamic_3x9x9 130 ns 130 ns 5170402 BM_MatrixTransposeMatrixMultiplyEigen_Static_2x3x4 10.3 ns 10.3 ns 69105234 BM_MatrixTransposeMatrixMultiplyEigen_Static_3x3x3 28.9 ns 28.9 ns 24478934 BM_MatrixTransposeMatrixMultiplyEigen_Static_4x4x4 23.7 ns 23.7 ns 29351926 BM_MatrixTransposeMatrixMultiplyEigen_Static_8x8x8 233 ns 233 ns 2929398 BM_MatrixTransposeMatrixMultiplyEigen_Static_9x9x3 211 ns 211 ns 3287409 BM_MatrixTransposeMatrixMultiplyEigen_Static_9x3x3 26.5 ns 26.5 ns 26515136 BM_MatrixTransposeMatrixMultiplyEigen_Static_3x9x9 196 ns 196 ns 3594314 BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_2x3x4 9.05 ns 9.05 ns 77621001 BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_3x3x3 11.1 ns 11.1 ns 62227812 BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_4x4x4 25.5 ns 25.5 ns 27356089 BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_8x8x8 248 ns 248 ns 2834983 BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_9x9x3 229 ns 229 ns 3082369 BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_9x3x3 28.4 ns 28.4 ns 24318629 BM_MatrixTransposeMatrixMultiplyEigen_Dynamic_3x9x9 229 ns 229 ns 3091288 BM_MatrixTransposeMatrixMultiplyNaive_Static_2x3x4 11.0 ns 11.0 ns 63773538 BM_MatrixTransposeMatrixMultiplyNaive_Static_3x3x3 19.1 ns 19.1 ns 37003139 BM_MatrixTransposeMatrixMultiplyNaive_Static_4x4x4 49.1 ns 49.1 ns 14142301 BM_MatrixTransposeMatrixMultiplyNaive_Static_8x8x8 244 ns 244 ns 2874755 BM_MatrixTransposeMatrixMultiplyNaive_Static_9x9x3 140 ns 140 ns 4992156 BM_MatrixTransposeMatrixMultiplyNaive_Static_9x3x3 46.2 ns 46.2 ns 15068317 BM_MatrixTransposeMatrixMultiplyNaive_Static_3x9x9 112 ns 112 ns 6213574 BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_2x3x4 9.74 ns 9.74 ns 72155001 
BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_3x3x3 11.5 ns 11.5 ns 60070577 BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_4x4x4 52.5 ns 52.5 ns 13473642 BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_8x8x8 224 ns 224 ns 3124264 BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_9x9x3 98.0 ns 98.0 ns 7199292 BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_9x3x3 34.7 ns 34.6 ns 20203685 BM_MatrixTransposeMatrixMultiplyNaive_Dynamic_3x9x9 105 ns 105 ns 6653151 Change-Id: Iee403b4d27801d1614ecc1f78a1f8c0011514bd7
diff --git a/internal/ceres/small_blas_gemm_benchmark.cc b/internal/ceres/small_blas_gemm_benchmark.cc index aa6c41d..5248471 100644 --- a/internal/ceres/small_blas_gemm_benchmark.cc +++ b/internal/ceres/small_blas_gemm_benchmark.cc
@@ -47,10 +47,10 @@ // benchmark. class MatrixMatrixMultiplyData { public: - MatrixMatrixMultiplyData( - int a_rows, int a_cols, int b_rows, int b_cols, int c_rows, int c_cols) + MatrixMatrixMultiplyData(int a_rows, int a_cols, int b_rows, int b_cols, + int c_rows, int c_cols) : num_elements_(1000), - a_size_(a_rows * a_cols), + a_size_(num_elements_ * a_rows * a_cols), b_size_(b_rows * b_cols), c_size_(c_rows * c_cols), a_(num_elements_ * a_size_, 1.00001), @@ -72,97 +72,130 @@ std::vector<double> c_; }; -static void MatrixMatrixMultiplySizeArguments( - benchmark::internal::Benchmark* benchmark) { - const std::vector<int> b_rows = {1, 2, 3, 4, 6, 8}; - const std::vector<int> b_cols = {1, 2, 3, 4, 8, 12, 15}; - const std::vector<int> c_cols = b_cols; - for (int i : b_rows) { - for (int j : b_cols) { - for (int k : c_cols) { - benchmark->Args({i, j, k}); - } - } - } -} +#define GEMM_KIND_EQ 0 +#define GEMM_KIND_ADD 1 +#define GEMM_KIND_SUB -1 -void BM_MatrixMatrixMultiplyDynamic(benchmark::State& state) { - const int i = state.range(0); - const int j = state.range(1); - const int k = state.range(2); +#define BENCHMARK_MM_FN(FN, M, N, K, NAME, MT, NT, KT) \ + void static BM_##FN##_##NAME##_##M##x##N##x##K(benchmark::State& state) { \ + const int b_rows = M; \ + const int b_cols = N; \ + const int c_rows = b_cols; \ + const int c_cols = K; \ + const int a_rows = b_rows; \ + const int a_cols = c_cols; \ + MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows, \ + c_cols); \ + const int num_elements = data.num_elements(); \ + int iter = 0; \ + for (auto _ : state) { \ + FN<MT, KT, KT, NT, GEMM_KIND_ADD>( \ + data.GetB(iter), b_rows, b_cols, data.GetC(iter), c_rows, c_cols, \ + data.GetA(iter), 512, 512, a_rows, a_cols); \ + iter = (iter + 1) % num_elements; \ + } \ + } \ + BENCHMARK(BM_##FN##_##NAME##_##M##x##N##x##K); - const int b_rows = i; - const int b_cols = j; - const int c_rows = b_cols; - const int c_cols = k; - const int a_rows = b_rows; - const 
int a_cols = c_cols; +#define BENCHMARK_STATIC_MM_FN(FN, M, N, K) \ + BENCHMARK_MM_FN(FN, M, N, K, Static, M, N, K) +#define BENCHMARK_DYNAMIC_MM_FN(FN, M, N, K) \ + BENCHMARK_MM_FN(FN, M, N, K, Dynamic, Eigen::Dynamic, Eigen::Dynamic, \ + Eigen::Dynamic) - MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows, c_cols); - const int num_elements = data.num_elements(); +#define BENCHMARK_MTM_FN(FN, M, N, K, NAME, MT, NT, KT) \ + void static BM_##FN##_##NAME##_##M##x##N##x##K(benchmark::State& state) { \ + const int b_rows = M; \ + const int b_cols = N; \ + const int c_rows = b_rows; \ + const int c_cols = K; \ + const int a_rows = b_cols; \ + const int a_cols = c_cols; \ + MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows, \ + c_cols); \ + const int num_elements = data.num_elements(); \ + int iter = 0; \ + for (auto _ : state) { \ + FN<KT, MT, KT, NT, GEMM_KIND_ADD>( \ + data.GetB(iter), b_rows, b_cols, data.GetC(iter), c_rows, c_cols, \ + data.GetA(iter), 0, 0, a_rows, a_cols); \ + iter = (iter + 1) % num_elements; \ + } \ + } \ + BENCHMARK(BM_##FN##_##NAME##_##M##x##N##x##K); - int iter = 0; - for (auto _ : state) { - // a += b * c - // clang-format off - MatrixMatrixMultiply - <Eigen::Dynamic, Eigen::Dynamic,Eigen::Dynamic,Eigen::Dynamic, 1> - (data.GetB(iter), b_rows, b_cols, - data.GetC(iter), c_rows, c_cols, - data.GetA(iter), 0, 0, a_rows, a_cols); - // clang-format on - iter = (iter + 1) % num_elements; - } -} +#define BENCHMARK_STATIC_MMT_FN(FN, M, N, K) \ + BENCHMARK_MTM_FN(FN, M, N, K, Static, M, N, K) +#define BENCHMARK_DYNAMIC_MMT_FN(FN, M, N, K) \ + BENCHMARK_MTM_FN(FN, M, N, K, Dynamic, Eigen::Dynamic, Eigen::Dynamic, \ + Eigen::Dynamic) -BENCHMARK(BM_MatrixMatrixMultiplyDynamic) - ->Apply(MatrixMatrixMultiplySizeArguments); +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 2, 3, 4) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 3, 3) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 4, 4, 4) 
+BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 8, 8, 8) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 9, 3) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 3, 3) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 9, 9) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 2, 3, 4) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 3, 3) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 4, 4, 4) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 8, 8, 8) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 9, 3) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 9, 3, 3) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyEigen, 3, 9, 9) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 2, 3, 4) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 3, 3) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 4, 4, 4) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 8, 8, 8) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 9, 3) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 3, 3) +BENCHMARK_STATIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 9, 9) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 2, 3, 4) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 3, 3) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 4, 4, 4) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 8, 8, 8) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 9, 3) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 9, 3, 3) +BENCHMARK_DYNAMIC_MM_FN(MatrixMatrixMultiplyNaive, 3, 9, 9) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 2, 3, 4) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 3, 3) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 4, 4, 4) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 8, 8, 8) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 9, 3) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 3, 3) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 
9, 9) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 2, 3, 4) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 3, 3) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 4, 4, 4) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 8, 8, 8) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 9, 3) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 9, 3, 3) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyEigen, 3, 9, 9) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 2, 3, 4) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 3, 3) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 4, 4, 4) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 8, 8, 8) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 9, 3) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 3, 3) +BENCHMARK_STATIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 9, 9) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 2, 3, 4) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 3, 3) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 4, 4, 4) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 8, 8, 8) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 9, 3) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 9, 3, 3) +BENCHMARK_DYNAMIC_MMT_FN(MatrixTransposeMatrixMultiplyNaive, 3, 9, 9) -static void MatrixTransposeMatrixMultiplySizeArguments( - benchmark::internal::Benchmark* benchmark) { - std::vector<int> b_rows = {1, 2, 3, 4, 6, 8}; - std::vector<int> b_cols = {1, 2, 3, 4, 8, 12, 15}; - std::vector<int> c_cols = b_rows; - for (int i : b_rows) { - for (int j : b_cols) { - for (int k : c_cols) { - benchmark->Args({i, j, k}); - } - } - } -} - -void BM_MatrixTransposeMatrixMultiplyDynamic(benchmark::State& state) { - const int i = state.range(0); - const int j = state.range(1); - const 
int k = state.range(2); - - const int b_rows = i; - const int b_cols = j; - const int c_rows = b_rows; - const int c_cols = k; - const int a_rows = b_cols; - const int a_cols = c_cols; - - MatrixMatrixMultiplyData data(a_rows, a_cols, b_rows, b_cols, c_rows, c_cols); - const int num_elements = data.num_elements(); - - int iter = 0; - for (auto _ : state) { - // a += b' * c - // clang-format off - MatrixTransposeMatrixMultiply - <Eigen::Dynamic,Eigen::Dynamic,Eigen::Dynamic,Eigen::Dynamic, 1> - (data.GetB(iter), b_rows, b_cols, - data.GetC(iter), c_rows, c_cols, - data.GetA(iter), 0, 0, a_rows, a_cols); - // clang-format on - iter = (iter + 1) % num_elements; - } -} - -BENCHMARK(BM_MatrixTransposeMatrixMultiplyDynamic) - ->Apply(MatrixTransposeMatrixMultiplySizeArguments); +#undef GEMM_KIND_EQ +#undef GEMM_KIND_ADD +#undef GEMM_KIND_SUB +#undef BENCHMARK_MM_FN +#undef BENCHMARK_STATIC_MM_FN +#undef BENCHMARK_DYNAMIC_MM_FN +#undef BENCHMARK_MTM_FN +#undef BENCHMARK_DYNAMIC_MMT_FN +#undef BENCHMARK_STATIC_MMT_FN } // namespace internal } // namespace ceres