Replace more instances of Eigen GEMV with Ceres BLAS.

With this ITERATIVE_SCHUR with JACOBI preconditioner went down from
280 seconds to 150 seconds on problem-744-543562-pre.txt.

Change-Id: I4f319c1108421e8d59f58654a4c0576ad65df609
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f234c7..eee67fa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -654,9 +654,13 @@
       "${CMAKE_CXX_FLAGS} -Werror -Wall -Wextra -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter")
 ENDIF ("${UNIX}")
 
-# We can be even stricter when using CLang
+# Use a larger inlining threshold for Clang, since it hobbles Eigen,
+# resulting in an unreasonably slow version of the blas routines. The
+# -Qunused-arguments is needed because CMake passes the inline
+# threshold to the linker and clang complains about it and dies.
 IF ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-type-c-linkage")
+  SET(CMAKE_CXX_FLAGS
+      "${CMAKE_CXX_FLAGS} -Qunused-arguments -mllvm -inline-threshold=600 -Wno-return-type-c-linkage")
 ENDIF()
 
 ADD_SUBDIRECTORY(internal/ceres)
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index dbe5ec9..ab6fcef 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -33,6 +33,7 @@
 #include <cstddef>
 #include <algorithm>
 #include <vector>
+#include "ceres/blas.h"
 #include "ceres/block_structure.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/matrix_proto.h"
@@ -117,16 +118,15 @@
   for (int i = 0; i < block_structure_->rows.size(); ++i) {
     int row_block_pos = block_structure_->rows[i].block.position;
     int row_block_size = block_structure_->rows[i].block.size;
-    VectorRef yref(y + row_block_pos, row_block_size);
     const vector<Cell>& cells = block_structure_->rows[i].cells;
     for (int j = 0; j < cells.size(); ++j) {
       int col_block_id = cells[j].block_id;
       int col_block_size = block_structure_->cols[col_block_id].size;
       int col_block_pos = block_structure_->cols[col_block_id].position;
-      ConstVectorRef xref(x + col_block_pos, col_block_size);
-      MatrixRef m(values_.get() + cells[j].position,
-                  row_block_size, col_block_size);
-      yref += m.lazyProduct(xref);
+      MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+          values_.get() + cells[j].position, row_block_size, col_block_size,
+          x + col_block_pos,
+          y + row_block_pos);
     }
   }
 }
@@ -138,16 +138,16 @@
   for (int i = 0; i < block_structure_->rows.size(); ++i) {
     int row_block_pos = block_structure_->rows[i].block.position;
     int row_block_size = block_structure_->rows[i].block.size;
-    const ConstVectorRef xref(x + row_block_pos, row_block_size);
     const vector<Cell>& cells = block_structure_->rows[i].cells;
     for (int j = 0; j < cells.size(); ++j) {
       int col_block_id = cells[j].block_id;
       int col_block_size = block_structure_->cols[col_block_id].size;
       int col_block_pos = block_structure_->cols[col_block_id].position;
-      VectorRef yref(y + col_block_pos, col_block_size);
-      MatrixRef m(values_.get() + cells[j].position,
-                  row_block_size, col_block_size);
-      yref += m.transpose().lazyProduct(xref);
+      MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+          values_.get() + cells[j].position, row_block_size, col_block_size,
+          x + row_block_pos,
+          y + col_block_pos);
+
     }
   }
 }
diff --git a/internal/ceres/partitioned_matrix_view.cc b/internal/ceres/partitioned_matrix_view.cc
index 0722fc8..c488184 100644
--- a/internal/ceres/partitioned_matrix_view.cc
+++ b/internal/ceres/partitioned_matrix_view.cc
@@ -35,6 +35,7 @@
 #include <algorithm>
 #include <cstring>
 #include <vector>
+#include "ceres/blas.h"
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/block_structure.h"
 #include "ceres/internal/eigen.h"
@@ -103,13 +104,10 @@
     const int col_block_id = cell.block_id;
     const int col_block_pos = bs->cols[col_block_id].position;
     const int col_block_size = bs->cols[col_block_id].size;
-
-    ConstVectorRef xref(x + col_block_pos, col_block_size);
-    VectorRef yref(y + row_block_pos, row_block_size);
-    ConstMatrixRef m(row_values + cell.position,
-                     row_block_size,
-                     col_block_size);
-    yref += m.lazyProduct(xref);
+    MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+        row_values + cell.position, row_block_size, col_block_size,
+        x + col_block_pos,
+        y + row_block_pos);
   }
 }
 
@@ -124,20 +122,16 @@
   for (int r = 0; r < bs->rows.size(); ++r) {
     const int row_block_pos = bs->rows[r].block.position;
     const int row_block_size = bs->rows[r].block.size;
-    VectorRef yref(y + row_block_pos, row_block_size);
     const vector<Cell>& cells = bs->rows[r].cells;
     for (int c = (r < num_row_blocks_e_) ? 1 : 0; c < cells.size(); ++c) {
       const double* row_values = matrix_.RowBlockValues(r);
       const int col_block_id = cells[c].block_id;
       const int col_block_pos = bs->cols[col_block_id].position;
       const int col_block_size = bs->cols[col_block_id].size;
-
-      ConstVectorRef xref(x + col_block_pos - num_cols_e(),
-                          col_block_size);
-      ConstMatrixRef m(row_values + cells[c].position,
-                       row_block_size,
-                       col_block_size);
-      yref += m.lazyProduct(xref);
+      MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+          row_values + cells[c].position, row_block_size, col_block_size,
+          x + col_block_pos - num_cols_e(),
+          y + row_block_pos);
     }
   }
 }
@@ -155,13 +149,10 @@
     const int col_block_id = cell.block_id;
     const int col_block_pos = bs->cols[col_block_id].position;
     const int col_block_size = bs->cols[col_block_id].size;
-
-    ConstVectorRef xref(x + row_block_pos, row_block_size);
-    VectorRef yref(y + col_block_pos, col_block_size);
-    ConstMatrixRef m(row_values + cell.position,
-                     row_block_size,
-                     col_block_size);
-    yref += m.transpose().lazyProduct(xref);
+    MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+        row_values + cell.position, row_block_size, col_block_size,
+        x + row_block_pos,
+        y + col_block_pos);
   }
 }
 
@@ -176,19 +167,16 @@
   for (int r = 0; r < bs->rows.size(); ++r) {
     const int row_block_pos = bs->rows[r].block.position;
     const int row_block_size = bs->rows[r].block.size;
-    ConstVectorRef xref(x + row_block_pos, row_block_size);
     const vector<Cell>& cells = bs->rows[r].cells;
     for (int c = (r < num_row_blocks_e_) ? 1 : 0; c < cells.size(); ++c) {
       const double* row_values = matrix_.RowBlockValues(r);
       const int col_block_id = cells[c].block_id;
       const int col_block_pos = bs->cols[col_block_id].position;
       const int col_block_size = bs->cols[col_block_id].size;
-
-      VectorRef yref(y + col_block_pos - num_cols_e(), col_block_size);
-      ConstMatrixRef m(row_values + cells[c].position,
-                       row_block_size,
-                       col_block_size);
-      yref += m.transpose().lazyProduct(xref);
+      MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+        row_values + cells[c].position, row_block_size, col_block_size,
+        x + row_block_pos,
+        y + col_block_pos - num_cols_e());
     }
   }
 }
@@ -267,15 +255,15 @@
     const int row_block_size = bs->rows[r].block.size;
     const int block_id = cell.block_id;
     const int col_block_size = bs->cols[block_id].size;
-    ConstMatrixRef m(row_values + cell.position,
-                           row_block_size,
-                           col_block_size);
-
     const int cell_position =
         block_diagonal_structure->rows[block_id].cells[0].position;
 
-    MatrixRef(block_diagonal->mutable_values() + cell_position,
-              col_block_size, col_block_size).noalias() += m.transpose() * m;
+    MatrixTransposeMatrixMultiply
+        <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>(
+            row_values + cell.position, row_block_size, col_block_size,
+            row_values + cell.position, row_block_size, col_block_size,
+            block_diagonal->mutable_values() + cell_position,
+            0, 0, col_block_size, col_block_size);
   }
 }
 
@@ -298,15 +286,16 @@
     for (int c = (r < num_row_blocks_e_) ? 1 : 0; c < cells.size(); ++c) {
       const int col_block_id = cells[c].block_id;
       const int col_block_size = bs->cols[col_block_id].size;
-      ConstMatrixRef m(row_values + cells[c].position,
-                       row_block_size,
-                       col_block_size);
       const int diagonal_block_id = col_block_id - num_col_blocks_e_;
       const int cell_position =
           block_diagonal_structure->rows[diagonal_block_id].cells[0].position;
 
-      MatrixRef(block_diagonal->mutable_values() + cell_position,
-                col_block_size, col_block_size).noalias() += m.transpose() * m;
+      MatrixTransposeMatrixMultiply
+          <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>(
+              row_values + cells[c].position, row_block_size, col_block_size,
+              row_values + cells[c].position, row_block_size, col_block_size,
+              block_diagonal->mutable_values() + cell_position,
+              0, 0, col_block_size, col_block_size);
     }
   }
 }