Faster  BlockRandomAccessSparseMatrix::SymmetricRightMultiply.

Trade a small amount of memory to improve the cache coherency of
the SymmetricRightMultiply operation.

The resulting code leads to a 10-20% speedup in the linear solver
end to end.

Change-Id: I8ab2fe152099e849b211b5b19e4ef9f03d8e7f1c
diff --git a/internal/ceres/block_random_access_sparse_matrix.cc b/internal/ceres/block_random_access_sparse_matrix.cc
index 9da16a4..c43a9b7 100644
--- a/internal/ceres/block_random_access_sparse_matrix.cc
+++ b/internal/ceres/block_random_access_sparse_matrix.cc
@@ -88,6 +88,8 @@
        ++it) {
     const int row_block_size = blocks_[it->first];
     const int col_block_size = blocks_[it->second];
+    cell_values_.push_back(make_pair(make_pair(it->first, it->second),
+                                     values + pos));
     layout_[IntPairToLong(it->first, it->second)] =
         new CellInfo(values + pos);
     pos += row_block_size * col_block_size;
@@ -156,20 +158,19 @@
 
 void BlockRandomAccessSparseMatrix::SymmetricRightMultiply(const double* x,
                                                            double* y) const {
-  for (LayoutType::const_iterator it = layout_.begin();
-       it != layout_.end();
-       ++it) {
-    int row;
-    int col;
-    LongToIntPair(it->first, &row, &col);
-
+  vector< pair<pair<int, int>, double*> >::const_iterator it =
+      cell_values_.begin();
+  for (; it != cell_values_.end(); ++it) {
+    const int row = it->first.first;
     const int row_block_size = blocks_[row];
     const int row_block_pos = block_positions_[row];
+
+    const int col = it->first.second;
     const int col_block_size = blocks_[col];
     const int col_block_pos = block_positions_[col];
 
     MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
-        it->second->values, row_block_size, col_block_size,
+        it->second, row_block_size, col_block_size,
         x + col_block_pos,
         y + row_block_pos);
 
@@ -179,7 +180,7 @@
     // triangular multiply also.
     if (row != col) {
       MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
-          it->second->values, row_block_size, col_block_size,
+          it->second, row_block_size, col_block_size,
           x + row_block_pos,
           y + col_block_pos);
     }
diff --git a/internal/ceres/block_random_access_sparse_matrix.h b/internal/ceres/block_random_access_sparse_matrix.h
index a4f89d8..51b5d20 100644
--- a/internal/ceres/block_random_access_sparse_matrix.h
+++ b/internal/ceres/block_random_access_sparse_matrix.h
@@ -111,6 +111,10 @@
   typedef HashMap<long int, CellInfo* > LayoutType;
   LayoutType layout_;
 
+  // In order traversal of contents of the matrix. This allows us to
+  // implement a matrix-vector which is 20% faster than using the
+  // iterator in the Layout object instead.
+  vector<pair<pair<int, int>, double*> > cell_values_;
   // The underlying matrix object which actually stores the cells.
   scoped_ptr<TripletSparseMatrix> tsm_;