Minor cleanups.

1. Further BLAS and heap allocation cleanups in schur_eliminator_impl.h
   (see the sketch after this list).
2. Modularize blas.h using macros.
3. Lint cleanups from William Rucklidge.
4. Small changes to jet.h.
5. ResidualBlock now uses blas.h.
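
For illustration, here is a minimal sketch of the heap-allocation cleanup
pattern from item 1, based on the diff below: a stack-backed FixedArray
replaces a dynamically sized Eigen temporary, and the fixed-size kernels
in ceres/blas.h replace the corresponding Eigen expressions. The helper
function and its name are hypothetical; FixedArray, MatrixVectorMultiply
and the template arguments are taken from the patch.

  #include "ceres/blas.h"
  #include "ceres/internal/fixed_array.h"

  namespace ceres {
  namespace internal {

  // Computes y = A * x for a small e_block without a heap-allocated
  // Eigen temporary. When kEBlockSize is known at compile time the
  // kernel in blas.h specializes on the fixed size; kOperation = 0
  // overwrites y rather than accumulating into it.
  template <int kEBlockSize>
  void MultiplyEBlock(const double* A,  // e_block_size x e_block_size, row-major
                      const double* x,  // e_block_size
                      const int e_block_size,
                      double* y) {      // e_block_size
    MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>(
        A, e_block_size, e_block_size, x, y);
  }

  }  // namespace internal
  }  // namespace ceres

Scratch space for y is obtained as FixedArray<double, 8> y(e_block_size)
and passed via y.get(), which keeps the storage on the stack for blocks
of up to 8 elements.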

Performance improvements:

For static and dynamic sized blocks, the performance is largely unchanged.

-use_quaternions -ordering user -linear_solver sparse_schur

                                        master         change
problem: 16-22106
gcc                                        3.4            3.3
clang                                      2.8            2.7

problem: 49-7776
gcc                                        1.7            1.7
clang                                      1.4            1.4

problem: 245-198739
gcc                                       80.1           79.6
clang                                     80.6           76.2

problem: 257-65132
gcc                                       12.2           12.0
clang                                     10.4           10.2

problem: 356-226730
gcc                                       99.0           96.8
clang                                     88.9           88.3

problem: 744-543562
gcc                                      361.5          356.2
clang                                    352.7          343.5

problem: 1024-110968
gcc                                       45.9           45.6
clang                                     42.6           42.1

However, performance when using local parameterizations improves
significantly because residual_block.cc now uses blas.h.
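
A minimal sketch of the kind of operation this speeds up, under the
assumption that blas.h provides a MatrixMatrixMultiply kernel with the
same calling convention as the MatrixTransposeMatrixMultiply call visible
in the diff below; the helper name and signature are illustrative, not
copied from residual_block.cc:

  #include "Eigen/Core"
  #include "ceres/blas.h"

  namespace ceres {
  namespace internal {

  // jacobian_local = jacobian_global * global_to_local, i.e. chain the
  // residual block's Jacobian with the local parameterization's Jacobian
  // using a blas.h kernel instead of Eigen temporaries.
  inline void ApplyLocalParameterizationJacobian(
      const double* jacobian_global,  // num_residuals x global_size, row-major
      const double* global_to_local,  // global_size x local_size, row-major
      const int num_residuals,
      const int global_size,
      const int local_size,
      double* jacobian_local) {       // num_residuals x local_size, row-major
    // kOperation = 0 overwrites the output block instead of accumulating.
    MatrixMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic,
                         Eigen::Dynamic, Eigen::Dynamic, 0>(
        jacobian_global, num_residuals, global_size,
        global_to_local, global_size, local_size,
        jacobian_local, 0, 0, num_residuals, local_size);
  }

  }  // namespace internal
  }  // namespace ceres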

-use_quaternions -use_local_parameterization -ordering user -linear_solver sparse_schur

                                        master         change
problem: 16-22106
gcc                                        3.6            3.3
clang                                      3.5            2.8

problem: 49-7776
gcc                                        1.8            1.6
clang                                      1.7            1.4

problem: 245-198739
gcc                                       79.7           76.1
clang                                     79.7           73.0

problem: 257-65132
gcc                                       12.8           11.9
clang                                     12.3            9.8

problem: 356-226730
gcc                                      101.9           93.5
clang                                    105.0           86.8

problem: 744-543562
gcc                                      367.9          350.5
clang                                    355.3          323.1

problem: 1024-110968
gcc                                       43.0           40.3
clang                                     41.0           37.5

Change-Id: I6dcf7476ddaa77cb116558d112a9cf1e832f5fc9
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index b46eab9..5afb2f2 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -51,16 +51,18 @@
 
 #include <algorithm>
 #include <map>
-#include "Eigen/Dense"
+
 #include "ceres/blas.h"
 #include "ceres/block_random_access_matrix.h"
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/block_structure.h"
 #include "ceres/internal/eigen.h"
+#include "ceres/internal/fixed_array.h"
 #include "ceres/internal/scoped_ptr.h"
 #include "ceres/map_util.h"
 #include "ceres/schur_eliminator.h"
 #include "ceres/stl_util.h"
+#include "Eigen/Dense"
 #include "glog/logging.h"
 
 namespace ceres {
@@ -240,8 +242,9 @@
       ete.setZero();
     }
 
-    typename EigenTypes<kEBlockSize>::Vector g(e_block_size);
-    g.setZero();
+    FixedArray<double, 8> g(e_block_size);
+    typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size);
+    gref.setZero();
 
     // We are going to be computing
     //
@@ -256,7 +259,7 @@
     // in this chunk (g) and add the outer product of the f_blocks to
     // Schur complement (S += F'F).
     ChunkDiagonalBlockAndGradient(
-        chunk, A, b, chunk.start, &ete, &g, buffer, lhs);
+        chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs);
 
     // Normally one wouldn't compute the inverse explicitly, but
     // e_block_size will typically be a small number like 3, in
@@ -273,7 +276,12 @@
     // linear system.
     //
     //   rhs = F'b - F'E(E'E)^(-1) E'b
-    UpdateRhs(chunk, A, b, chunk.start, inverse_ete * g, rhs);
+
+    FixedArray<double, 8> inverse_ete_g(e_block_size);
+    MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>(
+        inverse_ete.data(), e_block_size, e_block_size, g.get(), inverse_ete_g.get());
+
+    UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);
 
     // S -= F'E(E'E)^{-1}E'F
     ChunkOuterProduct(bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
@@ -318,7 +326,9 @@
       const Cell& e_cell = row.cells.front();
       DCHECK_EQ(e_block_id, e_cell.block_id);
 
-      typename EigenTypes<kRowBlockSize>::Vector sj =
+      FixedArray<double, 8> sj(row.block.size);
+
+      typename EigenTypes<kRowBlockSize>::VectorRef(sj.get(), row.block.size) =
           typename EigenTypes<kRowBlockSize>::ConstVectorRef
           (b + bs->rows[chunk.start + j].block.position, row.block.size);
 
@@ -330,16 +340,16 @@
         MatrixVectorMultiply<kRowBlockSize, kFBlockSize, -1>(
             row_values + row.cells[c].position, row.block.size, f_block_size,
             z + lhs_row_layout_[r_block],
-            sj.data());
+            sj.get());
       }
 
       MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
           row_values + e_cell.position, row.block.size, e_block_size,
-          sj.data(),
+          sj.get(),
           y_ptr);
 
       MatrixTransposeMatrixMultiply
-          <kRowBlockSize, kEBlockSize,kRowBlockSize, kEBlockSize, 1>(
+          <kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
               row_values + e_cell.position, row.block.size, e_block_size,
               row_values + e_cell.position, row.block.size, e_block_size,
               ete.data(), 0, 0, e_block_size, e_block_size);
@@ -359,25 +369,25 @@
           const BlockSparseMatrixBase* A,
           const double* b,
           int row_block_counter,
-          const Vector& inverse_ete_g,
+          const double* inverse_ete_g,
           double* rhs) {
   const CompressedRowBlockStructure* bs = A->block_structure();
-  const int e_block_size = inverse_ete_g.rows();
+  const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
+  const int e_block_size = bs->cols[e_block_id].size;
+
   int b_pos = bs->rows[row_block_counter].block.position;
   for (int j = 0; j < chunk.size; ++j) {
     const CompressedRow& row = bs->rows[row_block_counter + j];
     const double *row_values = A->RowBlockValues(row_block_counter + j);
     const Cell& e_cell = row.cells.front();
 
-    const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
-        e_block(row_values + e_cell.position,
-                row.block.size,
-                e_block_size);
-
-    const typename EigenTypes<kRowBlockSize>::Vector
-        sj =
+    typename EigenTypes<kRowBlockSize>::Vector sj =
         typename EigenTypes<kRowBlockSize>::ConstVectorRef
-         (b + b_pos, row.block.size) - e_block * (inverse_ete_g);
+        (b + b_pos, row.block.size);
+
+    MatrixVectorMultiply<kRowBlockSize, kEBlockSize, -1>(
+        row_values + e_cell.position, row.block.size, e_block_size,
+        inverse_ete_g, sj.data());
 
     for (int c = 1; c < row.cells.size(); ++c) {
       const int block_id = row.cells[c].block_id;
@@ -421,7 +431,7 @@
     const double* b,
     int row_block_counter,
     typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* ete,
-    typename EigenTypes<kEBlockSize>::Vector* g,
+    double* g,
     double* buffer,
     BlockRandomAccessMatrix* lhs) {
   const CompressedRowBlockStructure* bs = A->block_structure();
@@ -453,7 +463,7 @@
     MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
         row_values + e_cell.position, row.block.size, e_block_size,
         b + b_pos,
-        g->data());
+        g);
 
 
     // buffer = E'F. This computation is done by iterating over the
@@ -550,10 +560,10 @@
       const int block_id = row.cells[c].block_id;
       const int block_size = bs->cols[block_id].size;
       const int block = block_id - num_eliminate_blocks_;
-      VectorRef(rhs + lhs_row_layout_[block], block_size).noalias()
-          += (ConstMatrixRef(row_values + row.cells[c].position,
-                             row.block.size, block_size).transpose() *
-              ConstVectorRef(b + row.block.position, row.block.size));
+      MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+          row_values + row.cells[c].position, row.block.size, block_size,
+          b + row.block.position,
+          rhs + lhs_row_layout_[block]);
     }
     NoEBlockRowOuterProduct(A, row_block_counter, lhs);
   }
@@ -588,8 +598,6 @@
     DCHECK_GE(block1, 0);
 
     const int block1_size = bs->cols[row.cells[i].block_id].size;
-    const ConstMatrixRef b1(row_values + row.cells[i].position,
-                            row.block.size, block1_size);
     int r, c, row_stride, col_stride;
     CellInfo* cell_info = lhs->GetCell(block1, block1,
                                        &r, &c,