Further BLAS improvements.
1. Switch to Eigen's implementation when all dimensions are fixed.
2. Use lazyProduct for eigen matrix-vector product. This brings
eigen's performance on iterative_schur closer to what it used
to be before the last commit. There is however still an
improvement to be had by using the naive implementation when
the matrix and vector have dynamic dimensions.
BENCHMARK
HEAD CHANGE
problem-16-22106-pre.txt
gcc-eigen sparse_schur 0.859 gcc-eigen sparse_schur 0.853
clang-eigen sparse_schur 0.848 clang-eigen sparse_schur 0.850
gcc-blas sparse_schur 0.956 gcc-blas sparse_schur 0.865
clang-blas sparse_schur 0.954 clang-blas sparse_schur 0.858
gcc-eigen iterative_schur 4.656 gcc-eigen iterative_schur 3.271
clang-eigen iterative_schur 4.664 clang-eigen iterative_schur 3.307
gcc-blas iterative_schur 2.598 gcc-blas iterative_schur 2.620
clang-blas iterative_schur 2.554 clang-blas iterative_schur 2.567
problem-49-7776-pre.txt
gcc-eigen sparse_schur 0.477 gcc-eigen sparse_schur 0.472
clang-eigen sparse_schur 0.475 clang-eigen sparse_schur 0.479
gcc-blas sparse_schur 0.521 gcc-blas sparse_schur 0.469
clang-blas sparse_schur 0.508 clang-blas sparse_schur 0.471
gcc-eigen iterative_schur 3.172 gcc-eigen iterative_schur 2.088
clang-eigen iterative_schur 3.161 clang-eigen iterative_schur 2.079
gcc-blas iterative_schur 1.701 gcc-blas iterative_schur 1.720
clang-blas iterative_schur 1.708 clang-blas iterative_schur 1.694
problem-245-198739-pre.txt
gcc-eigen sparse_schur 28.092 gcc-eigen sparse_schur 28.233
clang-eigen sparse_schur 28.148 clang-eigen sparse_schur 28.400
gcc-blas sparse_schur 30.919 gcc-blas sparse_schur 28.110
clang-blas sparse_schur 31.001 clang-blas sparse_schur 28.407
gcc-eigen iterative_schur 63.095 gcc-eigen iterative_schur 43.694
clang-eigen iterative_schur 63.412 clang-eigen iterative_schur 43.473
gcc-blas iterative_schur 33.353 gcc-blas iterative_schur 33.321
clang-blas iterative_schur 33.276 clang-blas iterative_schur 33.278
problem-257-65132-pre.txt
gcc-eigen sparse_schur 3.687 gcc-eigen sparse_schur 3.629
clang-eigen sparse_schur 3.669 clang-eigen sparse_schur 3.652
gcc-blas sparse_schur 3.947 gcc-blas sparse_schur 3.673
clang-blas sparse_schur 3.952 clang-blas sparse_schur 3.678
gcc-eigen iterative_schur 121.512 gcc-eigen iterative_schur 76.833
clang-eigen iterative_schur 123.547 clang-eigen iterative_schur 78.763
gcc-blas iterative_schur 68.334 gcc-blas iterative_schur 68.612
clang-blas iterative_schur 67.793 clang-blas iterative_schur 68.266
Notes:
1. Naive BLAS was a bit worse than eigen on fixed sized matrices. We did not see this
before because of the different inlining thresholds. Fixing this boosted eigen's
performance. Also the disparity between gcc and clang has gone away.
2. SPARSE_SCHUR performance remains the same, since it is only testing static sized
matrices.
3. ITERATIVE_SCHUR performance goes up substantially due to the lazyProduct change,
but even there, since most of the products are dynamic sized, the naive implementation
wins handily.
Change-Id: Idc17f35b9c68aaebb1b2e131adf3af8374a85a4c
diff --git a/internal/ceres/blas.h b/internal/ceres/blas.h
index 028b63a..12667d0 100644
--- a/internal/ceres/blas.h
+++ b/internal/ceres/blas.h
@@ -65,6 +65,19 @@
#define CERES_MAYBE_NOALIAS .noalias()
#endif
+// For the matrix-matrix functions below, there are three functions
+// for each functionality. Foo, FooNaive and FooEigen. Foo is the one
+// to be called by the user. FooNaive is a basic loop based
+// implementation and FooEigen uses Eigen's implementation. Foo
+// chooses between FooNaive and FooEigen depending on how many of the
+// template arguments are fixed at compile time. Currently, FooEigen
+// is called if all matrix dimenions are compile time
+// constants. FooNaive is called otherwise. This leads to the best
+// performance currently.
+//
+// TODO(sameeragarwal): Benchmark and simplify the matrix-vector
+// functions.
+
// C op A * B;
//
// where op can be +=, -=, or =.
@@ -98,25 +111,23 @@
// ------------
//
template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixMatrixMultiply(const double* A,
- const int num_row_a,
- const int num_col_a,
- const double* B,
- const int num_row_b,
- const int num_col_b,
- double* C,
- const int start_row_c,
- const int start_col_c,
- const int row_stride_c,
- const int col_stride_c) {
-#ifdef CERES_NO_CUSTOM_BLAS
+inline void MatrixMatrixMultiplyEigen(const double* A,
+ const int num_row_a,
+ const int num_col_a,
+ const double* B,
+ const int num_row_b,
+ const int num_col_b,
+ double* C,
+ const int start_row_c,
+ const int start_col_c,
+ const int row_stride_c,
+ const int col_stride_c) {
const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref(B, num_row_b, num_col_b);
MatrixRef Cref(C, row_stride_c, col_stride_c);
Eigen::Block<MatrixRef, kRowA, kColB> block(Cref,
start_row_c, start_col_c,
num_row_a, num_col_b);
-
if (kOperation > 0) {
block CERES_MAYBE_NOALIAS += Aref * Bref;
} else if (kOperation < 0) {
@@ -124,9 +135,20 @@
} else {
block CERES_MAYBE_NOALIAS = Aref * Bref;
}
+}
-#else
-
+template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
+inline void MatrixMatrixMultiplyNaive(const double* A,
+ const int num_row_a,
+ const int num_col_a,
+ const double* B,
+ const int num_row_b,
+ const int num_col_b,
+ double* C,
+ const int start_row_c,
+ const int start_col_c,
+ const int row_stride_c,
+ const int col_stride_c) {
DCHECK_GT(num_row_a, 0);
DCHECK_GT(num_col_a, 0);
DCHECK_GT(num_row_b, 0);
@@ -169,9 +191,46 @@
}
}
}
-#endif // CERES_NO_CUSTOM_BLAS
}
+template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
+inline void MatrixMatrixMultiply(const double* A,
+ const int num_row_a,
+ const int num_col_a,
+ const double* B,
+ const int num_row_b,
+ const int num_col_b,
+ double* C,
+ const int start_row_c,
+ const int start_col_c,
+ const int row_stride_c,
+ const int col_stride_c) {
+#ifdef CERES_NO_CUSTOM_BLAS
+ MatrixMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
+ A, num_row_a, num_col_a,
+ B, num_row_b, num_col_b,
+ C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+ return;
+
+#else
+
+ if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
+ kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
+ MatrixMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
+ A, num_row_a, num_col_a,
+ B, num_row_b, num_col_b,
+ C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+ } else {
+ MatrixMatrixMultiplyNaive<kRowA, kColA, kRowB, kColB, kOperation>(
+ A, num_row_a, num_col_a,
+ B, num_row_b, num_col_b,
+ C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+ }
+
+#endif
+}
+
+
// C op A' * B;
//
// where op can be +=, -=, or =.
@@ -205,25 +264,23 @@
// ------------
//
template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixTransposeMatrixMultiply(const double* A,
- const int num_row_a,
- const int num_col_a,
- const double* B,
- const int num_row_b,
- const int num_col_b,
- double* C,
- const int start_row_c,
- const int start_col_c,
- const int row_stride_c,
- const int col_stride_c) {
-#ifdef CERES_NO_CUSTOM_BLAS
+inline void MatrixTransposeMatrixMultiplyEigen(const double* A,
+ const int num_row_a,
+ const int num_col_a,
+ const double* B,
+ const int num_row_b,
+ const int num_col_b,
+ double* C,
+ const int start_row_c,
+ const int start_col_c,
+ const int row_stride_c,
+ const int col_stride_c) {
const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref(B, num_row_b, num_col_b);
MatrixRef Cref(C, row_stride_c, col_stride_c);
Eigen::Block<MatrixRef, kColA, kColB> block(Cref,
start_row_c, start_col_c,
num_col_a, num_col_b);
-
if (kOperation > 0) {
block CERES_MAYBE_NOALIAS += Aref.transpose() * Bref;
} else if (kOperation < 0) {
@@ -231,9 +288,20 @@
} else {
block CERES_MAYBE_NOALIAS = Aref.transpose() * Bref;
}
+}
-#else
-
+template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
+inline void MatrixTransposeMatrixMultiplyNaive(const double* A,
+ const int num_row_a,
+ const int num_col_a,
+ const double* B,
+ const int num_row_b,
+ const int num_col_b,
+ double* C,
+ const int start_row_c,
+ const int start_col_c,
+ const int row_stride_c,
+ const int col_stride_c) {
DCHECK_GT(num_row_a, 0);
DCHECK_GT(num_col_a, 0);
DCHECK_GT(num_row_b, 0);
@@ -276,7 +344,43 @@
}
}
}
-#endif // CERES_NO_CUSTOM_BLAS
+}
+
+template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
+inline void MatrixTransposeMatrixMultiply(const double* A,
+ const int num_row_a,
+ const int num_col_a,
+ const double* B,
+ const int num_row_b,
+ const int num_col_b,
+ double* C,
+ const int start_row_c,
+ const int start_col_c,
+ const int row_stride_c,
+ const int col_stride_c) {
+#ifdef CERES_NO_CUSTOM_BLAS
+ MatrixTransposeMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
+ A, num_row_a, num_col_a,
+ B, num_row_b, num_col_b,
+ C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+ return;
+
+#else
+
+ if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
+ kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
+ MatrixTransposeMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
+ A, num_row_a, num_col_a,
+ B, num_row_b, num_col_b,
+ C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+ } else {
+ MatrixTransposeMatrixMultiplyNaive<kRowA, kColA, kRowB, kColB, kOperation>(
+ A, num_row_a, num_col_a,
+ B, num_row_b, num_col_b,
+ C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+ }
+
+#endif
}
template<int kRowA, int kColA, int kOperation>
@@ -290,14 +394,15 @@
const typename EigenTypes<kColA>::ConstVectorRef bref(b, num_col_a);
typename EigenTypes<kRowA>::VectorRef cref(c, num_row_a);
+ // lazyProduct works better than .noalias() for matrix-vector
+ // products.
if (kOperation > 0) {
- cref.noalias() += Aref * bref;
+ cref += Aref.lazyProduct(bref);
} else if (kOperation < 0) {
- cref.noalias() -= Aref * bref;
+ cref -= Aref.lazyProduct(bref);
} else {
- cref.noalias() = Aref * bref;
+ cref -= Aref.lazyProduct(bref);
}
-
#else
DCHECK_GT(num_row_a, 0);
@@ -347,14 +452,15 @@
const typename EigenTypes<kRowA>::ConstVectorRef bref(b, num_row_a);
typename EigenTypes<kColA>::VectorRef cref(c, num_col_a);
+ // lazyProduct works better than .noalias() for matrix-vector
+ // products.
if (kOperation > 0) {
- cref.noalias() += Aref.transpose() * bref;
+ cref += Aref.transpose().lazyProduct(bref);
} else if (kOperation < 0) {
- cref.noalias() -= Aref.transpose() * bref;
+ cref -= Aref.transpose().lazyProduct(bref);
} else {
- cref.noalias() = Aref.transpose() * bref;
+ cref -= Aref.transpose().lazyProduct(bref);
}
-
#else
DCHECK_GT(num_row_a, 0);