Further BLAS improvements. 1. Switch to Eigen's implementation when all dimensions are fixed. 2. Use lazyProduct for eigen matrix-vector product. This brings eigen's performance on iterative_schur closer to what it used to be before the last commit. There is however still an improvement to be had by using the naive implementation when the matrix and vector have dynamic dimensions. BENCHMARK HEAD CHANGE problem-16-22106-pre.txt gcc-eigen sparse_schur 0.859 gcc-eigen sparse_schur 0.853 clang-eigen sparse_schur 0.848 clang-eigen sparse_schur 0.850 gcc-blas sparse_schur 0.956 gcc-blas sparse_schur 0.865 clang-blas sparse_schur 0.954 clang-blas sparse_schur 0.858 gcc-eigen iterative_schur 4.656 gcc-eigen iterative_schur 3.271 clang-eigen iterative_schur 4.664 clang-eigen iterative_schur 3.307 gcc-blas iterative_schur 2.598 gcc-blas iterative_schur 2.620 clang-blas iterative_schur 2.554 clang-blas iterative_schur 2.567 problem-49-7776-pre.txt gcc-eigen sparse_schur 0.477 gcc-eigen sparse_schur 0.472 clang-eigen sparse_schur 0.475 clang-eigen sparse_schur 0.479 gcc-blas sparse_schur 0.521 gcc-blas sparse_schur 0.469 clang-blas sparse_schur 0.508 clang-blas sparse_schur 0.471 gcc-eigen iterative_schur 3.172 gcc-eigen iterative_schur 2.088 clang-eigen iterative_schur 3.161 clang-eigen iterative_schur 2.079 gcc-blas iterative_schur 1.701 gcc-blas iterative_schur 1.720 clang-blas iterative_schur 1.708 clang-blas iterative_schur 1.694 problem-245-198739-pre.txt gcc-eigen sparse_schur 28.092 gcc-eigen sparse_schur 28.233 clang-eigen sparse_schur 28.148 clang-eigen sparse_schur 28.400 gcc-blas sparse_schur 30.919 gcc-blas sparse_schur 28.110 clang-blas sparse_schur 31.001 clang-blas sparse_schur 28.407 gcc-eigen iterative_schur 63.095 gcc-eigen iterative_schur 43.694 clang-eigen iterative_schur 63.412 clang-eigen iterative_schur 43.473 gcc-blas iterative_schur 33.353 gcc-blas iterative_schur 33.321 clang-blas iterative_schur 33.276 clang-blas iterative_schur 33.278 problem-257-65132-pre.txt gcc-eigen sparse_schur 3.687 gcc-eigen sparse_schur 3.629 clang-eigen sparse_schur 3.669 clang-eigen sparse_schur 3.652 gcc-blas sparse_schur 3.947 gcc-blas sparse_schur 3.673 clang-blas sparse_schur 3.952 clang-blas sparse_schur 3.678 gcc-eigen iterative_schur 121.512 gcc-eigen iterative_schur 76.833 clang-eigen iterative_schur 123.547 clang-eigen iterative_schur 78.763 gcc-blas iterative_schur 68.334 gcc-blas iterative_schur 68.612 clang-blas iterative_schur 67.793 clang-blas iterative_schur 68.266 Notes: 1. Naive BLAS was a bit worse than eigen on fixed sized matrices. We did not see this before because of the different inlining thresholds. Fixing this boosted eigen's performance. Also the disparity between gcc and clang has gone away. 2. SPARSE_SCHUR performance remains the same, since it is only testing static sized matrices. 3. ITERATIVE_SCHUR performance goes up substantially due to the lazyProduct change, but even there, since most of the products are dynamic sized, the naive implementation wins handily. Change-Id: Idc17f35b9c68aaebb1b2e131adf3af8374a85a4c

commit: 520d35ef22dbcb05e344451c03ae64304e524a06 [log] [tgz]
author: Sameer Agarwal <sameeragarwal@google.com> Thu Apr 04 08:16:02 2013 -0700
committer: Sameer Agarwal <sameeragarwal@google.com> Thu Apr 04 18:02:08 2013 -0700
tree: bb95d578a625d1e3e81420933e0c0050ef3fc788
parent: 25ac54807eedf16fd6c34efc390901ee549a7d14 [diff]
diff --git a/internal/ceres/blas.h b/internal/ceres/blas.h
index 028b63a..12667d0 100644
--- a/internal/ceres/blas.h
+++ b/internal/ceres/blas.h

@@ -65,6 +65,19 @@
 #define CERES_MAYBE_NOALIAS .noalias()
 #endif
 
+// For the matrix-matrix functions below, there are three functions
+// for each functionality. Foo, FooNaive and FooEigen. Foo is the one
+// to be called by the user. FooNaive is a basic loop based
+// implementation and FooEigen uses Eigen's implementation. Foo
+// chooses between FooNaive and FooEigen depending on how many of the
+// template arguments are fixed at compile time. Currently, FooEigen
+// is called if all matrix dimenions are compile time
+// constants. FooNaive is called otherwise. This leads to the best
+// performance currently.
+//
+// TODO(sameeragarwal): Benchmark and simplify the matrix-vector
+// functions.
+
 // C op A * B;
 //
 // where op can be +=, -=, or =.
@@ -98,25 +111,23 @@
 //   ------------
 //
 template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixMatrixMultiply(const double* A,
-                                 const int num_row_a,
-                                 const int num_col_a,
-                                 const double* B,
-                                 const int num_row_b,
-                                 const int num_col_b,
-                                 double* C,
-                                 const int start_row_c,
-                                 const int start_col_c,
-                                 const int row_stride_c,
-                                 const int col_stride_c) {
-#ifdef CERES_NO_CUSTOM_BLAS
+inline void MatrixMatrixMultiplyEigen(const double* A,
+                                      const int num_row_a,
+                                      const int num_col_a,
+                                      const double* B,
+                                      const int num_row_b,
+                                      const int num_col_b,
+                                      double* C,
+                                      const int start_row_c,
+                                      const int start_col_c,
+                                      const int row_stride_c,
+                                      const int col_stride_c) {
   const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
   const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref(B, num_row_b, num_col_b);
   MatrixRef Cref(C, row_stride_c, col_stride_c);
   Eigen::Block<MatrixRef, kRowA, kColB> block(Cref,
                                               start_row_c, start_col_c,
                                               num_row_a, num_col_b);
-
   if (kOperation > 0) {
     block CERES_MAYBE_NOALIAS += Aref * Bref;
   } else if (kOperation < 0) {
@@ -124,9 +135,20 @@
   } else {
     block CERES_MAYBE_NOALIAS = Aref * Bref;
   }
+}
 
-#else
-
+template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
+inline void MatrixMatrixMultiplyNaive(const double* A,
+                                      const int num_row_a,
+                                      const int num_col_a,
+                                      const double* B,
+                                      const int num_row_b,
+                                      const int num_col_b,
+                                      double* C,
+                                      const int start_row_c,
+                                      const int start_col_c,
+                                      const int row_stride_c,
+                                      const int col_stride_c) {
   DCHECK_GT(num_row_a, 0);
   DCHECK_GT(num_col_a, 0);
   DCHECK_GT(num_row_b, 0);
@@ -169,9 +191,46 @@
       }
     }
   }
-#endif  // CERES_NO_CUSTOM_BLAS
 }
 
+template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
+inline void MatrixMatrixMultiply(const double* A,
+                                 const int num_row_a,
+                                 const int num_col_a,
+                                 const double* B,
+                                 const int num_row_b,
+                                 const int num_col_b,
+                                 double* C,
+                                 const int start_row_c,
+                                 const int start_col_c,
+                                 const int row_stride_c,
+                                 const int col_stride_c) {
+#ifdef CERES_NO_CUSTOM_BLAS
+  MatrixMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
+      A, num_row_a, num_col_a,
+      B, num_row_b, num_col_b,
+      C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+  return;
+
+#else
+
+  if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
+      kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
+    MatrixMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
+        A, num_row_a, num_col_a,
+        B, num_row_b, num_col_b,
+        C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+  } else {
+    MatrixMatrixMultiplyNaive<kRowA, kColA, kRowB, kColB, kOperation>(
+        A, num_row_a, num_col_a,
+        B, num_row_b, num_col_b,
+        C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+  }
+
+#endif
+}
+
+
 // C op A' * B;
 //
 // where op can be +=, -=, or =.
@@ -205,25 +264,23 @@
 //   ------------
 //
 template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixTransposeMatrixMultiply(const double* A,
-                                          const int num_row_a,
-                                          const int num_col_a,
-                                          const double* B,
-                                          const int num_row_b,
-                                          const int num_col_b,
-                                          double* C,
-                                          const int start_row_c,
-                                          const int start_col_c,
-                                          const int row_stride_c,
-                                          const int col_stride_c) {
-#ifdef CERES_NO_CUSTOM_BLAS
+inline void MatrixTransposeMatrixMultiplyEigen(const double* A,
+                                               const int num_row_a,
+                                               const int num_col_a,
+                                               const double* B,
+                                               const int num_row_b,
+                                               const int num_col_b,
+                                               double* C,
+                                               const int start_row_c,
+                                               const int start_col_c,
+                                               const int row_stride_c,
+                                               const int col_stride_c) {
   const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
   const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref(B, num_row_b, num_col_b);
   MatrixRef Cref(C, row_stride_c, col_stride_c);
   Eigen::Block<MatrixRef, kColA, kColB> block(Cref,
                                               start_row_c, start_col_c,
                                               num_col_a, num_col_b);
-
   if (kOperation > 0) {
     block CERES_MAYBE_NOALIAS += Aref.transpose() * Bref;
   } else if (kOperation < 0) {
@@ -231,9 +288,20 @@
   } else {
     block CERES_MAYBE_NOALIAS = Aref.transpose() * Bref;
   }
+}
 
-#else
-
+template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
+inline void MatrixTransposeMatrixMultiplyNaive(const double* A,
+                                               const int num_row_a,
+                                               const int num_col_a,
+                                               const double* B,
+                                               const int num_row_b,
+                                               const int num_col_b,
+                                               double* C,
+                                               const int start_row_c,
+                                               const int start_col_c,
+                                               const int row_stride_c,
+                                               const int col_stride_c) {
   DCHECK_GT(num_row_a, 0);
   DCHECK_GT(num_col_a, 0);
   DCHECK_GT(num_row_b, 0);
@@ -276,7 +344,43 @@
       }
     }
   }
-#endif  // CERES_NO_CUSTOM_BLAS
+}
+
+template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
+inline void MatrixTransposeMatrixMultiply(const double* A,
+                                          const int num_row_a,
+                                          const int num_col_a,
+                                          const double* B,
+                                          const int num_row_b,
+                                          const int num_col_b,
+                                          double* C,
+                                          const int start_row_c,
+                                          const int start_col_c,
+                                          const int row_stride_c,
+                                          const int col_stride_c) {
+#ifdef CERES_NO_CUSTOM_BLAS
+  MatrixTransposeMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
+      A, num_row_a, num_col_a,
+      B, num_row_b, num_col_b,
+      C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+  return;
+
+#else
+
+  if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
+      kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
+    MatrixTransposeMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
+        A, num_row_a, num_col_a,
+        B, num_row_b, num_col_b,
+        C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+  } else {
+    MatrixTransposeMatrixMultiplyNaive<kRowA, kColA, kRowB, kColB, kOperation>(
+        A, num_row_a, num_col_a,
+        B, num_row_b, num_col_b,
+        C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+  }
+
+#endif
 }
 
 template<int kRowA, int kColA, int kOperation>
@@ -290,14 +394,15 @@
   const typename EigenTypes<kColA>::ConstVectorRef bref(b, num_col_a);
   typename EigenTypes<kRowA>::VectorRef cref(c, num_row_a);
 
+  // lazyProduct works better than .noalias() for matrix-vector
+  // products.
   if (kOperation > 0) {
-    cref.noalias() += Aref * bref;
+    cref += Aref.lazyProduct(bref);
   } else if (kOperation < 0) {
-    cref.noalias() -= Aref * bref;
+    cref -= Aref.lazyProduct(bref);
   } else {
-    cref.noalias() = Aref * bref;
+    cref -= Aref.lazyProduct(bref);
   }
-
 #else
 
   DCHECK_GT(num_row_a, 0);
@@ -347,14 +452,15 @@
   const typename EigenTypes<kRowA>::ConstVectorRef bref(b, num_row_a);
   typename EigenTypes<kColA>::VectorRef cref(c, num_col_a);
 
+  // lazyProduct works better than .noalias() for matrix-vector
+  // products.
   if (kOperation > 0) {
-    cref.noalias() += Aref.transpose() * bref;
+    cref += Aref.transpose().lazyProduct(bref);
   } else if (kOperation < 0) {
-    cref.noalias() -= Aref.transpose() * bref;
+    cref -= Aref.transpose().lazyProduct(bref);
   } else {
-    cref.noalias() = Aref.transpose() * bref;
+    cref -= Aref.transpose().lazyProduct(bref);
   }
-
 #else
 
   DCHECK_GT(num_row_a, 0);
commit	520d35ef22dbcb05e344451c03ae64304e524a06	[log] [tgz]
author	Sameer Agarwal <sameeragarwal@google.com>	Thu Apr 04 08:16:02 2013 -0700
committer	Sameer Agarwal <sameeragarwal@google.com>	Thu Apr 04 18:02:08 2013 -0700
tree	bb95d578a625d1e3e81420933e0c0050ef3fc788
parent	25ac54807eedf16fd6c34efc390901ee549a7d14 [diff]