Replace Eigen block operations with small GEMM and GEMV loops.

1. Add Matrix-Matrix and Matrix-Vector multiply functions.
2. Replace Eigen usage in SchurEliminator with these custom
   matrix operations.
3. Save on some memory allocations in ChunkOuterProduct.
4. Replace LDLT with LLT.

As a result on problem-16-22106-pre.txt, the linear solver time
goes down from 1.2s to 0.64s.

Change-Id: I2daa667960e0a1e8834489965a30be31f37fd87f
diff --git a/internal/ceres/schur_eliminator.h b/internal/ceres/schur_eliminator.h
index 877621b..e0c7fda 100644
--- a/internal/ceres/schur_eliminator.h
+++ b/internal/ceres/schur_eliminator.h
@@ -321,9 +321,25 @@
   // see the documentation of the Chunk object above.
   vector<Chunk> chunks_;
 
+  // TODO(sameeragarwal): The following two arrays contain per-thread
+  // storage. They should be refactored into a per thread struct.
+
   // Buffer to store the products of the y and z blocks generated
-  // during the elimination phase.
+  // during the elimination phase. buffer_ is of size num_threads *
+  // buffer_size_. Each thread accesses the chunk
+  //
+  //   [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_]
+  //
   scoped_array<double> buffer_;
+
+  // Buffer to store per thread matrix matrix products used by
+  // ChunkOuterProduct. Like buffer_ it is of size num_threads *
+  // buffer_size_. Each thread accesses the chunk
+  //
+  //   [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_]
+  //
+  scoped_array<double> chunk_outer_product_buffer_;
+
   int buffer_size_;
   int num_threads_;
   int uneliminated_row_begins_;