Minor cleanups.

1. Further BLAS and heap allocation cleanups in  schur_eliminator_impl.h
2. Modularize blas.h using macros.
3. Lint cleanups from William Rucklidge.
4. Small changes to jet.h
5. ResidualBlock now uses blas.h

Performance improvements:

For static and dynamic sized blocks, the peformance is not changed much.

-use_quaternions -ordering user -linear_solver sparse_schur

                                        master         change
problem: 16-22106
gcc                                        3.4            3.3
clang                                      2.8            2.7

problem: 49-7776
gcc                                        1.7            1.7
clang                                      1.4            1.4

problem: 245-198739
gcc                                       80.1           79.6
clang                                     80.6           76.2

problem: 257-65132
gcc                                       12.2           12.0
clang                                     10.4           10.2

problem: 356-226730
gcc                                       99.0           96.8
clang                                     88.9           88.3

problem: 744-543562
gcc                                      361.5          356.2
clang                                    352.7          343.5

problem: 1024-110968
gcc                                       45.9           45.6
clang                                     42.6           42.1

However, performance when using local parameterizations is
significantly improved due to residual_block.cc using blas.h

-use_quaternions -use_local_parameterization -ordering user -linear_solver sparse_schur

                                        master         change
problem: 16-22106
gcc                                        3.6            3.3
clang                                      3.5            2.8

problem: 49-7776
gcc                                        1.8            1.6
clang                                      1.7            1.4

problem: 245-198739
gcc                                       79.7           76.1
clang                                     79.7           73.0

problem: 257-65132
gcc                                       12.8           11.9
clang                                     12.3            9.8

problem: 356-226730
gcc                                      101.9           93.5
clang                                    105.0           86.8

problem: 744-543562
gcc                                      367.9          350.5
clang                                    355.3          323.1

problem: 1024-110968
gcc                                       43.0           40.3
clang                                     41.0           37.5

Change-Id: I6dcf7476ddaa77cb116558d112a9cf1e832f5fc9
diff --git a/include/ceres/jet.h b/include/ceres/jet.h
index 1238123..000bd1c 100644
--- a/include/ceres/jet.h
+++ b/include/ceres/jet.h
@@ -348,8 +348,8 @@
   //   b + v   (b + v)(b - v)        b^2
   //
   // which holds because v*v = 0.
-  h.a = f.a / g.a;
-  const T g_a_inverse = 1.0 / g.a;
+  const T g_a_inverse = T(1.0) / g.a;
+  h.a = f.a * g_a_inverse;
   const T f_a_by_g_a = f.a * g_a_inverse;
   for (int i = 0; i < N; ++i) {
     h.v[i] = (f.v[i] - f_a_by_g_a * g.v[i]) * g_a_inverse;
@@ -450,7 +450,7 @@
 Jet<T, N> sqrt(const Jet<T, N>& f) {
   Jet<T, N> g;
   g.a = sqrt(f.a);
-  const T two_a_inverse = 1.0 / (T(2.0) * g.a);
+  const T two_a_inverse = T(1.0) / (T(2.0) * g.a);
   g.v = f.v * two_a_inverse;
   return g;
 }
diff --git a/internal/ceres/blas.h b/internal/ceres/blas.h
index bacf1ff..9629b3d 100644
--- a/internal/ceres/blas.h
+++ b/internal/ceres/blas.h
@@ -65,20 +65,71 @@
 #define CERES_MAYBE_NOALIAS .noalias()
 #endif
 
-// For the matrix-matrix functions below, there are three functions
-// for each functionality. Foo, FooNaive and FooEigen. Foo is the one
-// to be called by the user. FooNaive is a basic loop based
+// The following three macros are used to share code and reduce
+// template junk across the various GEMM variants.
+#define CERES_GEMM_BEGIN(name)                                          \
+  template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>  \
+  inline void name(const double* A,                                     \
+                   const int num_row_a,                                 \
+                   const int num_col_a,                                 \
+                   const double* B,                                     \
+                   const int num_row_b,                                 \
+                   const int num_col_b,                                 \
+                   double* C,                                           \
+                   const int start_row_c,                               \
+                   const int start_col_c,                               \
+                   const int row_stride_c,                              \
+                   const int col_stride_c)
+
+#define CERES_GEMM_NAIVE_HEADER                                         \
+  DCHECK_GT(num_row_a, 0);                                              \
+  DCHECK_GT(num_col_a, 0);                                              \
+  DCHECK_GT(num_row_b, 0);                                              \
+  DCHECK_GT(num_col_b, 0);                                              \
+  DCHECK_GE(start_row_c, 0);                                            \
+  DCHECK_GE(start_col_c, 0);                                            \
+  DCHECK_GT(row_stride_c, 0);                                           \
+  DCHECK_GT(col_stride_c, 0);                                           \
+  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));            \
+  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));            \
+  DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b));            \
+  DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b));            \
+  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);  \
+  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);  \
+  const int NUM_ROW_B = (kColB != Eigen::Dynamic ? kRowB : num_row_b);  \
+  const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
+
+#define CERES_GEMM_EIGEN_HEADER                                         \
+  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef               \
+  Aref(A, num_row_a, num_col_a);                                        \
+  const typename EigenTypes<kRowB, kColB>::ConstMatrixRef               \
+  Bref(B, num_row_b, num_col_b);                                        \
+  MatrixRef Cref(C, row_stride_c, col_stride_c);                        \
+
+#define CERES_CALL_GEMM(name)                                           \
+  name<kRowA, kColA, kRowB, kColB, kOperation>(                         \
+      A, num_row_a, num_col_a,                                          \
+      B, num_row_b, num_col_b,                                          \
+      C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+
+
+// For the matrix-matrix functions below, there are three variants for
+// each functionality. Foo, FooNaive and FooEigen. Foo is the one to
+// be called by the user. FooNaive is a basic loop based
 // implementation and FooEigen uses Eigen's implementation. Foo
 // chooses between FooNaive and FooEigen depending on how many of the
 // template arguments are fixed at compile time. Currently, FooEigen
-// is called if all matrix dimenions are compile time
+// is called if all matrix dimensions are compile time
 // constants. FooNaive is called otherwise. This leads to the best
 // performance currently.
 //
-// TODO(sameeragarwal): Benchmark and simplify the matrix-vector
-// functions.
-
-// C op A * B;
+// The MatrixMatrixMultiply variants compute:
+//
+//   C op A * B;
+//
+// The MatrixTransposeMatrixMultiply variants compute:
+//
+//   C op A' * B
 //
 // where op can be +=, -=, or =.
 //
@@ -91,7 +142,7 @@
 //   kOperation = -1  -> C -= A * B
 //   kOperation =  0  -> C  = A * B
 //
-// The function can write into matrices C which are larger than the
+// The functions can write into matrices C which are larger than the
 // matrix A * B. This is done by specifying the true size of C via
 // row_stride_c and col_stride_c, and then indicating where A * B
 // should be written into by start_row_c and start_col_c.
@@ -110,24 +161,11 @@
 //   ------------
 //   ------------
 //
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixMatrixMultiplyEigen(const double* A,
-                                      const int num_row_a,
-                                      const int num_col_a,
-                                      const double* B,
-                                      const int num_row_b,
-                                      const int num_col_b,
-                                      double* C,
-                                      const int start_row_c,
-                                      const int start_col_c,
-                                      const int row_stride_c,
-                                      const int col_stride_c) {
-  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
-  const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref(B, num_row_b, num_col_b);
-  MatrixRef Cref(C, row_stride_c, col_stride_c);
-  Eigen::Block<MatrixRef, kRowA, kColB> block(Cref,
-                                              start_row_c, start_col_c,
-                                              num_row_a, num_col_b);
+CERES_GEMM_BEGIN(MatrixMatrixMultiplyEigen) {
+  CERES_GEMM_EIGEN_HEADER
+  Eigen::Block<MatrixRef, kRowA, kColB>
+    block(Cref, start_row_c, start_col_c, num_row_a, num_col_b);
+
   if (kOperation > 0) {
     block CERES_MAYBE_NOALIAS += Aref * Bref;
   } else if (kOperation < 0) {
@@ -137,36 +175,8 @@
   }
 }
 
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixMatrixMultiplyNaive(const double* A,
-                                      const int num_row_a,
-                                      const int num_col_a,
-                                      const double* B,
-                                      const int num_row_b,
-                                      const int num_col_b,
-                                      double* C,
-                                      const int start_row_c,
-                                      const int start_col_c,
-                                      const int row_stride_c,
-                                      const int col_stride_c) {
-  DCHECK_GT(num_row_a, 0);
-  DCHECK_GT(num_col_a, 0);
-  DCHECK_GT(num_row_b, 0);
-  DCHECK_GT(num_col_b, 0);
-  DCHECK_GE(start_row_c, 0);
-  DCHECK_GE(start_col_c, 0);
-  DCHECK_GT(row_stride_c, 0);
-  DCHECK_GT(col_stride_c, 0);
-
-  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
-  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));
-  DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b));
-  DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b));
-
-  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
-  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
-  const int NUM_ROW_B = (kColB != Eigen::Dynamic ? kRowB : num_row_b);
-  const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
+CERES_GEMM_BEGIN(MatrixMatrixMultiplyNaive) {
+  CERES_GEMM_NAIVE_HEADER
   DCHECK_EQ(NUM_COL_A, NUM_ROW_B);
 
   const int NUM_ROW_C = NUM_ROW_A;
@@ -193,91 +203,26 @@
   }
 }
 
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixMatrixMultiply(const double* A,
-                                 const int num_row_a,
-                                 const int num_col_a,
-                                 const double* B,
-                                 const int num_row_b,
-                                 const int num_col_b,
-                                 double* C,
-                                 const int start_row_c,
-                                 const int start_col_c,
-                                 const int row_stride_c,
-                                 const int col_stride_c) {
+CERES_GEMM_BEGIN(MatrixMatrixMultiply) {
 #ifdef CERES_NO_CUSTOM_BLAS
-  MatrixMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
-      A, num_row_a, num_col_a,
-      B, num_row_b, num_col_b,
-      C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+
+  CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
   return;
 
 #else
 
   if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
       kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
-    MatrixMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
-        A, num_row_a, num_col_a,
-        B, num_row_b, num_col_b,
-        C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+    CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
   } else {
-    MatrixMatrixMultiplyNaive<kRowA, kColA, kRowB, kColB, kOperation>(
-        A, num_row_a, num_col_a,
-        B, num_row_b, num_col_b,
-        C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+    CERES_CALL_GEMM(MatrixMatrixMultiplyNaive)
   }
 
 #endif
 }
 
-
-// C op A' * B;
-//
-// where op can be +=, -=, or =.
-//
-// The template parameters (kRowA, kColA, kRowB, kColB) allow
-// specialization of the loop at compile time. If this information is
-// not available, then Eigen::Dynamic should be used as the template
-// argument.
-//
-//   kOperation =  1  -> C += A' * B
-//   kOperation = -1  -> C -= A' * B
-//   kOperation =  0  -> C  = A' * B
-//
-// The function can write into matrices C which are larger than the
-// matrix A' * B. This is done by specifying the true size of C via
-// row_stride_c and col_stride_c, and then indicating where A * B
-// should be written into by start_row_c and start_col_c.
-//
-// Graphically if row_stride_c = 10, col_stride_c = 12, start_row_c =
-// 4 and start_col_c = 5, then if A = 2x3 and B = 2x4, we get
-//
-//   ------------
-//   ------------
-//   ------------
-//   ------------
-//   -----xxxx---
-//   -----xxxx---
-//   -----xxxx---
-//   ------------
-//   ------------
-//   ------------
-//
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixTransposeMatrixMultiplyEigen(const double* A,
-                                               const int num_row_a,
-                                               const int num_col_a,
-                                               const double* B,
-                                               const int num_row_b,
-                                               const int num_col_b,
-                                               double* C,
-                                               const int start_row_c,
-                                               const int start_col_c,
-                                               const int row_stride_c,
-                                               const int col_stride_c) {
-  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
-  const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref(B, num_row_b, num_col_b);
-  MatrixRef Cref(C, row_stride_c, col_stride_c);
+CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyEigen) {
+  CERES_GEMM_EIGEN_HEADER
   Eigen::Block<MatrixRef, kColA, kColB> block(Cref,
                                               start_row_c, start_col_c,
                                               num_col_a, num_col_b);
@@ -290,36 +235,8 @@
   }
 }
 
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixTransposeMatrixMultiplyNaive(const double* A,
-                                               const int num_row_a,
-                                               const int num_col_a,
-                                               const double* B,
-                                               const int num_row_b,
-                                               const int num_col_b,
-                                               double* C,
-                                               const int start_row_c,
-                                               const int start_col_c,
-                                               const int row_stride_c,
-                                               const int col_stride_c) {
-  DCHECK_GT(num_row_a, 0);
-  DCHECK_GT(num_col_a, 0);
-  DCHECK_GT(num_row_b, 0);
-  DCHECK_GT(num_col_b, 0);
-  DCHECK_GE(start_row_c, 0);
-  DCHECK_GE(start_col_c, 0);
-  DCHECK_GT(row_stride_c, 0);
-  DCHECK_GT(col_stride_c, 0);
-
-  DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
-  DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));
-  DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b));
-  DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b));
-
-  const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
-  const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
-  const int NUM_ROW_B = (kColB != Eigen::Dynamic ? kRowB : num_row_b);
-  const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
+CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyNaive) {
+  CERES_GEMM_NAIVE_HEADER
   DCHECK_EQ(NUM_ROW_A, NUM_ROW_B);
 
   const int NUM_ROW_C = NUM_COL_A;
@@ -346,43 +263,37 @@
   }
 }
 
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixTransposeMatrixMultiply(const double* A,
-                                          const int num_row_a,
-                                          const int num_col_a,
-                                          const double* B,
-                                          const int num_row_b,
-                                          const int num_col_b,
-                                          double* C,
-                                          const int start_row_c,
-                                          const int start_col_c,
-                                          const int row_stride_c,
-                                          const int col_stride_c) {
+CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiply) {
 #ifdef CERES_NO_CUSTOM_BLAS
-  MatrixTransposeMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
-      A, num_row_a, num_col_a,
-      B, num_row_b, num_col_b,
-      C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+
+  CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
   return;
 
 #else
 
   if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
       kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
-    MatrixTransposeMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
-        A, num_row_a, num_col_a,
-        B, num_row_b, num_col_b,
-        C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+    CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
   } else {
-    MatrixTransposeMatrixMultiplyNaive<kRowA, kColA, kRowB, kColB, kOperation>(
-        A, num_row_a, num_col_a,
-        B, num_row_b, num_col_b,
-        C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+    CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyNaive)
   }
 
 #endif
 }
 
+// Matrix-Vector multiplication
+//
+// c op A * b;
+//
+// where op can be +=, -=, or =.
+//
+// The template parameters (kRowA, kColA) allow specialization of the
+// loop at compile time. If this information is not available, then
+// Eigen::Dynamic should be used as the template argument.
+//
+// kOperation =  1  -> c += A' * b
+// kOperation = -1  -> c -= A' * b
+// kOperation =  0  -> c  = A' * b
 template<int kRowA, int kColA, int kOperation>
 inline void MatrixVectorMultiply(const double* A,
                                  const int num_row_a,
@@ -390,7 +301,8 @@
                                  const double* b,
                                  double* c) {
 #ifdef CERES_NO_CUSTOM_BLAS
-  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
+  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef
+      Aref(A, num_row_a, num_col_a);
   const typename EigenTypes<kColA>::ConstVectorRef bref(b, num_col_a);
   typename EigenTypes<kRowA>::VectorRef cref(c, num_row_a);
 
@@ -430,17 +342,9 @@
 #endif  // CERES_NO_CUSTOM_BLAS
 }
 
+// Similar to MatrixVectorMultiply, except that A is transposed, i.e.,
+//
 // c op A' * b;
-//
-// where op can be +=, -=, or =.
-//
-// The template parameters (kRowA, kColA) allow specialization of the
-// loop at compile time. If this information is not available, then
-// Eigen::Dynamic should be used as the template argument.
-//
-// kOperation =  1  -> c += A' * b
-// kOperation = -1  -> c -= A' * b
-// kOperation =  0  -> c  = A' * b
 template<int kRowA, int kColA, int kOperation>
 inline void MatrixTransposeVectorMultiply(const double* A,
                                           const int num_row_a,
@@ -448,7 +352,8 @@
                                           const double* b,
                                           double* c) {
 #ifdef CERES_NO_CUSTOM_BLAS
-  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
+  const typename EigenTypes<kRowA, kColA>::ConstMatrixRef
+      Aref(A, num_row_a, num_col_a);
   const typename EigenTypes<kRowA>::ConstVectorRef bref(b, num_row_a);
   typename EigenTypes<kColA>::VectorRef cref(c, num_col_a);
 
@@ -488,7 +393,12 @@
 #endif  // CERES_NO_CUSTOM_BLAS
 }
 
+
 #undef CERES_MAYBE_NOALIAS
+#undef CERES_GEMM_BEGIN
+#undef CERES_GEMM_EIGEN_HEADER
+#undef CERES_GEMM_NAIVE_HEADER
+#undef CERES_CALL_GEMM
 
 }  // namespace internal
 }  // namespace ceres
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index ab6fcef..ae36d60 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -147,7 +147,6 @@
           values_.get() + cells[j].position, row_block_size, col_block_size,
           x + row_block_pos,
           y + col_block_pos);
-
     }
   }
 }
diff --git a/internal/ceres/residual_block.cc b/internal/ceres/residual_block.cc
index b91b0ed..4906607 100644
--- a/internal/ceres/residual_block.cc
+++ b/internal/ceres/residual_block.cc
@@ -35,6 +35,7 @@
 #include <cstddef>
 #include <vector>
 
+#include "ceres/blas.h"
 #include "ceres/corrector.h"
 #include "ceres/parameter_block.h"
 #include "ceres/residual_block_utils.h"
@@ -139,17 +140,15 @@
 
         // Apply local reparameterization to the jacobians.
         if (parameter_block->LocalParameterizationJacobian() != NULL) {
-          ConstMatrixRef local_to_global(
+          MatrixMatrixMultiply
+              <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 0>(
+              global_jacobians[i],
+              num_residuals,
+              parameter_block->Size(),
               parameter_block->LocalParameterizationJacobian(),
               parameter_block->Size(),
-              parameter_block->LocalSize());
-          MatrixRef global_jacobian(global_jacobians[i],
-                                    num_residuals,
-                                    parameter_block->Size());
-          MatrixRef local_jacobian(jacobians[i],
-                                   num_residuals,
-                                   parameter_block->LocalSize());
-          local_jacobian.noalias() = global_jacobian * local_to_global;
+              parameter_block->LocalSize(),
+              jacobians[i], 0, 0,  num_residuals, parameter_block->LocalSize());
         }
       }
     }
diff --git a/internal/ceres/schur_eliminator.h b/internal/ceres/schur_eliminator.h
index e0c7fda..f2c247a 100644
--- a/internal/ceres/schur_eliminator.h
+++ b/internal/ceres/schur_eliminator.h
@@ -277,7 +277,7 @@
       const double* b,
       int row_block_counter,
       typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* eet,
-      typename EigenTypes<kEBlockSize>::Vector* g,
+      double* g,
       double* buffer,
       BlockRandomAccessMatrix* lhs);
 
@@ -285,7 +285,7 @@
                  const BlockSparseMatrixBase* A,
                  const double* b,
                  int row_block_counter,
-                 const Vector& inverse_ete_g,
+                 const double* inverse_ete_g,
                  double* rhs);
 
   void ChunkOuterProduct(const CompressedRowBlockStructure* bs,
@@ -336,7 +336,7 @@
   // ChunkOuterProduct. Like buffer_ it is of size num_threads *
   // buffer_size_. Each thread accesses the chunk
   //
-  //   [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_]
+  //   [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ -1]
   //
   scoped_array<double> chunk_outer_product_buffer_;
 
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index b46eab9..5afb2f2 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -51,16 +51,18 @@
 
 #include <algorithm>
 #include <map>
-#include "Eigen/Dense"
+
 #include "ceres/blas.h"
 #include "ceres/block_random_access_matrix.h"
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/block_structure.h"
 #include "ceres/internal/eigen.h"
+#include "ceres/internal/fixed_array.h"
 #include "ceres/internal/scoped_ptr.h"
 #include "ceres/map_util.h"
 #include "ceres/schur_eliminator.h"
 #include "ceres/stl_util.h"
+#include "Eigen/Dense"
 #include "glog/logging.h"
 
 namespace ceres {
@@ -240,8 +242,9 @@
       ete.setZero();
     }
 
-    typename EigenTypes<kEBlockSize>::Vector g(e_block_size);
-    g.setZero();
+    FixedArray<double, 8> g(e_block_size);
+    typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size);
+    gref.setZero();
 
     // We are going to be computing
     //
@@ -256,7 +259,7 @@
     // in this chunk (g) and add the outer product of the f_blocks to
     // Schur complement (S += F'F).
     ChunkDiagonalBlockAndGradient(
-        chunk, A, b, chunk.start, &ete, &g, buffer, lhs);
+        chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs);
 
     // Normally one wouldn't compute the inverse explicitly, but
     // e_block_size will typically be a small number like 3, in
@@ -273,7 +276,12 @@
     // linear system.
     //
     //   rhs = F'b - F'E(E'E)^(-1) E'b
-    UpdateRhs(chunk, A, b, chunk.start, inverse_ete * g, rhs);
+
+    FixedArray<double, 8> inverse_ete_g(e_block_size);
+    MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>(
+        inverse_ete.data(), e_block_size, e_block_size, g.get(), inverse_ete_g.get());
+
+    UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);
 
     // S -= F'E(E'E)^{-1}E'F
     ChunkOuterProduct(bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
@@ -318,7 +326,9 @@
       const Cell& e_cell = row.cells.front();
       DCHECK_EQ(e_block_id, e_cell.block_id);
 
-      typename EigenTypes<kRowBlockSize>::Vector sj =
+      FixedArray<double, 8> sj(row.block.size);
+
+      typename EigenTypes<kRowBlockSize>::VectorRef(sj.get(), row.block.size) =
           typename EigenTypes<kRowBlockSize>::ConstVectorRef
           (b + bs->rows[chunk.start + j].block.position, row.block.size);
 
@@ -330,16 +340,16 @@
         MatrixVectorMultiply<kRowBlockSize, kFBlockSize, -1>(
             row_values + row.cells[c].position, row.block.size, f_block_size,
             z + lhs_row_layout_[r_block],
-            sj.data());
+            sj.get());
       }
 
       MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
           row_values + e_cell.position, row.block.size, e_block_size,
-          sj.data(),
+          sj.get(),
           y_ptr);
 
       MatrixTransposeMatrixMultiply
-          <kRowBlockSize, kEBlockSize,kRowBlockSize, kEBlockSize, 1>(
+          <kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
               row_values + e_cell.position, row.block.size, e_block_size,
               row_values + e_cell.position, row.block.size, e_block_size,
               ete.data(), 0, 0, e_block_size, e_block_size);
@@ -359,25 +369,25 @@
           const BlockSparseMatrixBase* A,
           const double* b,
           int row_block_counter,
-          const Vector& inverse_ete_g,
+          const double* inverse_ete_g,
           double* rhs) {
   const CompressedRowBlockStructure* bs = A->block_structure();
-  const int e_block_size = inverse_ete_g.rows();
+  const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
+  const int e_block_size = bs->cols[e_block_id].size;
+
   int b_pos = bs->rows[row_block_counter].block.position;
   for (int j = 0; j < chunk.size; ++j) {
     const CompressedRow& row = bs->rows[row_block_counter + j];
     const double *row_values = A->RowBlockValues(row_block_counter + j);
     const Cell& e_cell = row.cells.front();
 
-    const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
-        e_block(row_values + e_cell.position,
-                row.block.size,
-                e_block_size);
-
-    const typename EigenTypes<kRowBlockSize>::Vector
-        sj =
+    typename EigenTypes<kRowBlockSize>::Vector sj =
         typename EigenTypes<kRowBlockSize>::ConstVectorRef
-         (b + b_pos, row.block.size) - e_block * (inverse_ete_g);
+        (b + b_pos, row.block.size);
+
+    MatrixVectorMultiply<kRowBlockSize, kEBlockSize, -1>(
+        row_values + e_cell.position, row.block.size, e_block_size,
+        inverse_ete_g, sj.data());
 
     for (int c = 1; c < row.cells.size(); ++c) {
       const int block_id = row.cells[c].block_id;
@@ -421,7 +431,7 @@
     const double* b,
     int row_block_counter,
     typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* ete,
-    typename EigenTypes<kEBlockSize>::Vector* g,
+    double* g,
     double* buffer,
     BlockRandomAccessMatrix* lhs) {
   const CompressedRowBlockStructure* bs = A->block_structure();
@@ -453,7 +463,7 @@
     MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
         row_values + e_cell.position, row.block.size, e_block_size,
         b + b_pos,
-        g->data());
+        g);
 
 
     // buffer = E'F. This computation is done by iterating over the
@@ -550,10 +560,10 @@
       const int block_id = row.cells[c].block_id;
       const int block_size = bs->cols[block_id].size;
       const int block = block_id - num_eliminate_blocks_;
-      VectorRef(rhs + lhs_row_layout_[block], block_size).noalias()
-          += (ConstMatrixRef(row_values + row.cells[c].position,
-                             row.block.size, block_size).transpose() *
-              ConstVectorRef(b + row.block.position, row.block.size));
+      MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+          row_values + row.cells[c].position, row.block.size, block_size,
+          b + row.block.position,
+          rhs + lhs_row_layout_[block]);
     }
     NoEBlockRowOuterProduct(A, row_block_counter, lhs);
   }
@@ -588,8 +598,6 @@
     DCHECK_GE(block1, 0);
 
     const int block1_size = bs->cols[row.cells[i].block_id].size;
-    const ConstMatrixRef b1(row_values + row.cells[i].position,
-                            row.block.size, block1_size);
     int r, c, row_stride, col_stride;
     CellInfo* cell_info = lhs->GetCell(block1, block1,
                                        &r, &c,