Optimize J' * J in sparse_normal_cholesky_solver.

1. Add stype to the outer product computation to control whether the
output matrix is upper or lower triangular. For SuiteSparse, an upper
triangular matrix is generated, which SuiteSparse can use directly for
the Cholesky factorization without the overhead of a matrix transpose
(the new call pattern is sketched below).

2. Change the outer product computation to block multiplication.  This
reduces the computational complexity of the sort in preprocessing and
allows the block outer product to be formulated as dense Eigen block
matrix multiplications.

3. Solving 32 Tango problems on a Qualcomm MSM8994 Cortex-A53 (1.55 GHz):
   before change: 140 seconds
   after change:  131 seconds

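Sketch of the new call pattern (illustrative only; it mirrors the calls
in sparse_normal_cholesky_solver.cc, where stype = 1 requests a lower
triangular result for Eigen/CXSparse and stype = -1 an upper triangular
result for SuiteSparse):

    const int stype = 1;
    vector<int> program;
    scoped_ptr<CompressedRowSparseMatrix> outer_product(
        CompressedRowSparseMatrix::CreateOuterProductMatrixAndProgram(
            *A, stype, &program));
    CompressedRowSparseMatrix::ComputeOuterProduct(
        *A, stype, program, outer_product.get());
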
Change-Id: I8054114cef911de6a303310a448821ca296e4744
diff --git a/internal/ceres/compressed_row_jacobian_writer.cc b/internal/ceres/compressed_row_jacobian_writer.cc
index 40977b7..6ee7226 100644
--- a/internal/ceres/compressed_row_jacobian_writer.cc
+++ b/internal/ceres/compressed_row_jacobian_writer.cc
@@ -121,6 +121,13 @@
   // seems to be the only way to construct it without doing a memory copy.
   int* rows = jacobian->mutable_rows();
   int* cols = jacobian->mutable_cols();
+
+  // Initialize crsb rows and cols.
+  std::vector<int>& crsb_rows = *jacobian->mutable_crsb_rows();
+  std::vector<int>& crsb_cols = *jacobian->mutable_crsb_cols();
+  crsb_rows.resize(residual_blocks.size() + 1);
+  crsb_rows[0] = 0;
+
   int row_pos = 0;
   rows[0] = 0;
   for (int i = 0; i < residual_blocks.size(); ++i) {
@@ -157,6 +164,11 @@
                  << "Parameter Blocks: " << parameter_block_description;
     }
 
+    // Populate crsb rows and cols.
+    crsb_rows[i + 1] = crsb_rows[i] + parameter_indices.size();
+    std::copy(parameter_indices.begin(), parameter_indices.end(),
+              std::back_inserter(crsb_cols));
+
     // Update the row indices.
     const int num_residuals = residual_block->NumResiduals();
     for (int j = 0; j < num_residuals; ++j) {
diff --git a/internal/ceres/compressed_row_sparse_matrix.cc b/internal/ceres/compressed_row_sparse_matrix.cc
index 91d18bb..5d56612 100644
--- a/internal/ceres/compressed_row_sparse_matrix.cc
+++ b/internal/ceres/compressed_row_sparse_matrix.cc
@@ -69,6 +69,39 @@
   const int* cols;
 };
 
+void TransposeForCompressedRowSparseStructure(const int num_rows,
+                                              const int num_cols,
+                                              const int num_nonzeros,
+                                              const int* rows,
+                                              const int* cols,
+                                              const double* values,
+                                              int* transpose_rows,
+                                              int* transpose_cols,
+                                              double* transpose_values) {
+  for (int idx = 0; idx < num_nonzeros; ++idx) {
+    ++transpose_rows[cols[idx] + 1];
+  }
+
+  for (int i = 1; i < num_cols + 1; ++i) {
+    transpose_rows[i] += transpose_rows[i - 1];
+  }
+
+  for (int r = 0; r < num_rows; ++r) {
+    for (int idx = rows[r]; idx < rows[r + 1]; ++idx) {
+      const int c = cols[idx];
+      const int transpose_idx = transpose_rows[c]++;
+      transpose_cols[transpose_idx] = r;
+      if (values) {
+        transpose_values[transpose_idx] = values[idx];
+      }
+    }
+  }
+
+  for (int i = num_cols - 1; i > 0; --i) {
+    transpose_rows[i] = transpose_rows[i - 1];
+  }
+  transpose_rows[0] = 0;
+}
 }  // namespace
 
 // This constructor gives you a semi-initialized CompressedRowSparseMatrix.
@@ -220,6 +253,16 @@
   num_rows_ -= delta_rows;
   rows_.resize(num_rows_ + 1);
 
+  // The rest of the code updates the block information.
+  // Return immediately if there is no block information.
+  if (row_blocks_.empty()) {
+    return;
+  }
+
+  // Sanity check for the compressed row sparse block information.
+  CHECK_EQ(crsb_rows_.size(), row_blocks_.size() + 1);
+  CHECK_EQ(crsb_rows_.back(), crsb_cols_.size());
+
   // Walk the list of row blocks until we reach the new number of rows
   // and the drop the rest of the row blocks.
   int num_row_blocks = 0;
@@ -230,12 +273,18 @@
   }
 
   row_blocks_.resize(num_row_blocks);
+
+  // Update compressed row sparse block (crsb) information.
+  CHECK_EQ(num_rows, num_rows_);
+  crsb_rows_.resize(num_row_blocks + 1);
+  crsb_cols_.resize(crsb_rows_[num_row_blocks]);
 }
 
 void CompressedRowSparseMatrix::AppendRows(const CompressedRowSparseMatrix& m) {
   CHECK_EQ(m.num_cols(), num_cols_);
 
-  CHECK(row_blocks_.size() == 0 || m.row_blocks().size() !=0)
+  CHECK((row_blocks_.size() == 0 && m.row_blocks().size() == 0) ||
+        (row_blocks_.size() != 0 && m.row_blocks().size() != 0))
       << "Cannot append a matrix with row blocks to one without and vice versa."
       << "This matrix has : " << row_blocks_.size() << " row blocks."
       << "The matrix being appended has: " << m.row_blocks().size()
@@ -270,9 +319,38 @@
   }
 
   num_rows_ += m.num_rows();
+
+  // The rest of the code updates the block information.
+  // Return immediately if there is no block information.
+  if (row_blocks_.empty()) {
+    return;
+  }
+
+  // Sanity check for the compressed row sparse block information.
+  CHECK_EQ(crsb_rows_.size(), row_blocks_.size() + 1);
+  CHECK_EQ(crsb_rows_.back(), crsb_cols_.size());
+
   row_blocks_.insert(row_blocks_.end(),
                      m.row_blocks().begin(),
                      m.row_blocks().end());
+
+  // Append the compressed row sparse block (crsb) information.
+  const int num_crsb_nonzeros = crsb_cols_.size();
+  const int m_num_crsb_nonzeros = m.crsb_cols_.size();
+  crsb_cols_.resize(num_crsb_nonzeros + m_num_crsb_nonzeros);
+  std::copy(&m.crsb_cols()[0], &m.crsb_cols()[0] + m_num_crsb_nonzeros,
+            &crsb_cols_[num_crsb_nonzeros]);
+
+  const int num_crsb_rows = crsb_rows_.size() - 1;
+  const int m_num_crsb_rows = m.crsb_rows_.size() - 1;
+  crsb_rows_.resize(num_crsb_rows + m_num_crsb_rows + 1);
+  std::fill(crsb_rows_.begin() + num_crsb_rows,
+            crsb_rows_.begin() + num_crsb_rows + m_num_crsb_rows + 1,
+            crsb_rows_[num_crsb_rows]);
+
+  for (int r = 0; r < m_num_crsb_rows + 1; ++r) {
+    crsb_rows_[num_crsb_rows + r] += m.crsb_rows()[r];
+  }
 }
 
 void CompressedRowSparseMatrix::ToTextFile(FILE* file) const {
@@ -364,6 +442,15 @@
   *matrix->mutable_row_blocks() = blocks;
   *matrix->mutable_col_blocks() = blocks;
 
+  // Fill compressed row sparse block (crsb) information.
+  vector<int>& crsb_rows = *matrix->mutable_crsb_rows();
+  vector<int>& crsb_cols = *matrix->mutable_crsb_cols();
+  for (int i = 0; i < blocks.size(); ++i) {
+    crsb_rows.push_back(i);
+    crsb_cols.push_back(i);
+  }
+  crsb_rows.push_back(blocks.size());
+
   CHECK_EQ(idx_cursor, num_nonzeros);
   CHECK_EQ(col_cursor, num_rows);
   return matrix;
@@ -373,40 +460,44 @@
   CompressedRowSparseMatrix* transpose =
       new CompressedRowSparseMatrix(num_cols_, num_rows_, num_nonzeros());
 
-  int* transpose_rows = transpose->mutable_rows();
-  int* transpose_cols = transpose->mutable_cols();
-  double* transpose_values = transpose->mutable_values();
+  TransposeForCompressedRowSparseStructure(
+      num_rows(), num_cols(), num_nonzeros(),
+      rows(), cols(), values(),
+      transpose->mutable_rows(),
+      transpose->mutable_cols(),
+      transpose->mutable_values());
 
-  for (int idx = 0; idx < num_nonzeros(); ++idx) {
-    ++transpose_rows[cols_[idx] + 1];
+  // The rest of the code updates the block information.
+  // Return immediately if there is no block information.
+  if (row_blocks_.empty()) {
+    return transpose;
   }
 
-  for (int i = 1; i < transpose->num_rows() + 1; ++i) {
-    transpose_rows[i] += transpose_rows[i - 1];
-  }
-
-  for (int r = 0; r < num_rows(); ++r) {
-    for (int idx = rows_[r]; idx < rows_[r + 1]; ++idx) {
-      const int c = cols_[idx];
-      const int transpose_idx = transpose_rows[c]++;
-      transpose_cols[transpose_idx] = r;
-      transpose_values[transpose_idx] = values_[idx];
-    }
-  }
-
-  for (int i = transpose->num_rows() - 1; i > 0 ; --i) {
-    transpose_rows[i] = transpose_rows[i - 1];
-  }
-  transpose_rows[0] = 0;
+  // Sanity check for the compressed row sparse block information.
+  CHECK_EQ(crsb_rows_.size(), row_blocks_.size() + 1);
+  CHECK_EQ(crsb_rows_.back(), crsb_cols_.size());
 
   *(transpose->mutable_row_blocks()) = col_blocks_;
   *(transpose->mutable_col_blocks()) = row_blocks_;
 
+  // Transpose the compressed row sparse block (crsb) information.
+  vector<int>& transpose_crsb_rows = *transpose->mutable_crsb_rows();
+  vector<int>& transpose_crsb_cols = *transpose->mutable_crsb_cols();
+
+  transpose_crsb_rows.resize(col_blocks_.size() + 1);
+  std::fill(transpose_crsb_rows.begin(), transpose_crsb_rows.end(), 0);
+  transpose_crsb_cols.resize(crsb_cols_.size());
+
+  TransposeForCompressedRowSparseStructure(
+      row_blocks().size(), col_blocks().size(), crsb_cols().size(),
+      crsb_rows().data(), crsb_cols().data(), NULL,
+      transpose_crsb_rows.data(), transpose_crsb_cols.data(), NULL);
+
   return transpose;
 }
 
 namespace {
-// A ProductTerm is a term in the outer product of a matrix with
+// A ProductTerm is a term in the block outer product of a matrix with
 // itself.
 struct ProductTerm {
   ProductTerm(const int row, const int col, const int index)
@@ -428,72 +519,176 @@
   int index;
 };
 
+// Create the outer product matrix based on the block product information.
+// The input block product is already sorted.  This function does not
+// set the sparse rows/cols information.  Instead, it only counts the
+// nonzeros of each compressed row and stores them in row_nnz.
+// The caller of this function will traverse the block product in a
+// second pass to generate the sparse rows/cols information.
+// This function also computes the block offsets of the outer product
+// matrix, which are used in the outer product computation.
 CompressedRowSparseMatrix*
-CompressAndFillProgram(const int num_rows,
-                       const int num_cols,
-                       const vector<ProductTerm>& product,
-                       vector<int>* program) {
-  CHECK_GT(product.size(), 0);
+CreateOuterProductMatrix(const int num_cols,
+                         const vector<int>& blocks,
+                         const vector<ProductTerm>& product,
+                         vector<int>* row_nnz) {
 
   // Count the number of unique product term, which in turn is the
   // number of non-zeros in the outer product.
-  int num_nonzeros = 1;
+  // Also count the number of non-zeros in each row.
+  row_nnz->resize(blocks.size());
+  std::fill(row_nnz->begin(), row_nnz->end(), 0);
+  (*row_nnz)[product[0].row] = blocks[product[0].col];
+  int num_nonzeros = blocks[product[0].row] * blocks[product[0].col];
   for (int i = 1; i < product.size(); ++i) {
+    // Each (row, col) block is counted only once.
+    // This check relies on the product terms being sorted by (row, col).
     if (product[i].row != product[i - 1].row ||
         product[i].col != product[i - 1].col) {
-      ++num_nonzeros;
+      (*row_nnz)[product[i].row] += blocks[product[i].col];
+      num_nonzeros += blocks[product[i].row] * blocks[product[i].col];
     }
   }
 
   CompressedRowSparseMatrix* matrix =
-      new CompressedRowSparseMatrix(num_rows, num_cols, num_nonzeros);
+      new CompressedRowSparseMatrix(num_cols, num_cols, num_nonzeros);
+
+  // Compute the block offsets for the outer product matrix, which are
+  // used in ComputeOuterProduct.
+  vector<int>* block_offsets = matrix->mutable_block_offsets();
+  block_offsets->resize(blocks.size() + 1);
+  (*block_offsets)[0] = 0;
+  for (int i = 0; i < blocks.size(); ++i) {
+    (*block_offsets)[i + 1] = (*block_offsets)[i] + blocks[i];
+  }
+
+  return matrix;
+}
+
+CompressedRowSparseMatrix*
+CompressAndFillProgram(const int num_cols,
+                       const vector<int>& blocks,
+                       const vector<ProductTerm>& product,
+                       vector<int>* program) {
+  CHECK_GT(product.size(), 0);
+
+  vector<int> row_nnz;
+  CompressedRowSparseMatrix* matrix =
+      CreateOuterProductMatrix(num_cols, blocks, product, &row_nnz);
+
+  const vector<int>& block_offsets = matrix->block_offsets();
 
   int* crsm_rows = matrix->mutable_rows();
-  std::fill(crsm_rows, crsm_rows + num_rows + 1, 0);
+  std::fill(crsm_rows, crsm_rows + num_cols + 1, 0);
   int* crsm_cols = matrix->mutable_cols();
-  std::fill(crsm_cols, crsm_cols + num_nonzeros, 0);
+  std::fill(crsm_cols, crsm_cols + matrix->num_nonzeros(), 0);
 
   CHECK_NOTNULL(program)->clear();
   program->resize(product.size());
 
-  // Iterate over the sorted product terms. This means each row is
-  // filled one at a time, and we are able to assign a position in the
-  // values array to each term.
+  // Nonzero elements are not stored consecutively across the rows of a block.
+  // We separate the nonzeros into three categories:
+  //   nonzeros in all previous row blocks, counted in nnz
+  //   nonzeros in the current row, counted in row_nnz
+  //   nonzeros in previous col blocks of the current row, counted in col_nnz
   //
-  // If terms repeat, i.e., they contribute to the same entry in the
-  // result matrix), then they do not affect the sparsity structure of
-  // the result matrix.
+  // Given an element (j, k) within a block, where j and k are the
+  // relative positions with respect to the starting row and starting
+  // col of the block, the row and col of the element are
+  //   block_offsets[current.row] + j
+  //   block_offsets[current.col] + k
+  // and its position in the cols/values arrays is
+  //   nnz + row_nnz[current.row] * j + col_nnz + k
+  //
+  // program stores col_nnz for each block product term, which is used
+  // later in the outer product computation.
+  //
+  // There is no special handling for diagonal blocks as we generate a
+  // BLOCK triangular matrix (diagonal blocks are full blocks) instead
+  // of a standard triangular matrix.
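+  //
+  // For example (illustrative): with blocks = {2, 3} and sorted product
+  // terms (0, 0), (1, 0), (1, 1), we get row_nnz = {2, 5} and
+  // block_offsets = {0, 2, 5}, and element (j, k) of block (1, 0) lands
+  // at position 4 + 5 * j + k (nnz = 4, col_nnz = 0).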
   int nnz = 0;
-  crsm_cols[0] = product[0].col;
-  crsm_rows[product[0].row + 1]++;
-  (*program)[product[0].index] = nnz;
+  int col_nnz = 0;
+
+  // Process the first product term.
+  for (int j = 0; j < blocks[product[0].row]; ++j) {
+    crsm_rows[block_offsets[product[0].row] + j + 1] =
+        row_nnz[product[0].row];
+    for (int k = 0; k < blocks[product[0].col]; ++k) {
+      crsm_cols[row_nnz[product[0].row] * j + k]
+          = block_offsets[product[0].col] + k;
+    }
+  }
+
+  (*program)[product[0].index] = 0;
+
+  // Process the remaining product terms.
   for (int i = 1; i < product.size(); ++i) {
     const ProductTerm& previous = product[i - 1];
     const ProductTerm& current = product[i];
 
     // Sparsity structure is updated only if the term is not a repeat.
     if (previous.row != current.row || previous.col != current.col) {
-      crsm_cols[++nnz] = current.col;
-      crsm_rows[current.row + 1]++;
+      col_nnz += blocks[previous.col];
+      if (previous.row != current.row) {
+        nnz += col_nnz * blocks[previous.row];
+        col_nnz = 0;
+
+        for (int j = 0; j < blocks[current.row]; ++j) {
+          crsm_rows[block_offsets[current.row] + j + 1] =
+              row_nnz[current.row];
+        }
+      }
+
+      for (int j = 0; j < blocks[current.row]; ++j) {
+        for (int k = 0; k < blocks[current.col]; ++k) {
+          crsm_cols[nnz + row_nnz[current.row] * j + col_nnz + k]
+              = block_offsets[current.col] + k;
+        }
+      }
     }
 
-    // All terms get assigned the position in the values array where
-    // their value is accumulated.
-    (*program)[current.index] = nnz;
+    (*program)[current.index] = col_nnz;
   }
 
-  for (int i = 1; i < num_rows + 1; ++i) {
+  for (int i = 1; i < num_cols + 1; ++i) {
     crsm_rows[i] += crsm_rows[i - 1];
   }
 
   return matrix;
 }
 
+// input is a matrix of dimension <row_block_size, input_cols>
+// output is a matrix of dimension <col_block1_size, output_cols>
+//
+// Implements the block multiplication O += I1' * I2, where
+// I1 is block(0, col_block1_begin, row_block_size, col_block1_size) of input
+// I2 is block(0, col_block2_begin, row_block_size, col_block2_size) of input
+// O is block(0, 0, col_block1_size, col_block2_size) of output
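+//
+// Viewing input and output as row-major matrices with input_cols and
+// output_cols columns respectively, this is the dense (Eigen-style)
+// block multiplication
+//   output.block(0, 0, col_block1_size, col_block2_size) +=
+//       input.block(0, col_block1_begin, row_block_size, col_block1_size)
+//           .transpose() *
+//       input.block(0, col_block2_begin, row_block_size, col_block2_size)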
+void ComputeBlockMultiplication(const int row_block_size,
+                                const int col_block1_size,
+                                const int col_block2_size,
+                                const int col_block1_begin,
+                                const int col_block2_begin,
+                                const int input_cols,
+                                const double *input,
+                                const int output_cols,
+                                double *output) {
+  for (int r = 0; r < row_block_size; ++r) {
+    for (int idx1 = 0; idx1 < col_block1_size; ++idx1) {
+      for (int idx2 = 0; idx2 < col_block2_size; ++idx2) {
+        output[output_cols * idx1 + idx2] +=
+            input[input_cols * r + col_block1_begin + idx1] *
+            input[input_cols * r + col_block2_begin + idx2];
+      }
+    }
+  }
+}
 }  // namespace
 
 CompressedRowSparseMatrix*
 CompressedRowSparseMatrix::CreateOuterProductMatrixAndProgram(
       const CompressedRowSparseMatrix& m,
+      const int stype,
       vector<int>* program) {
   CHECK_NOTNULL(program)->clear();
   CHECK_GT(m.num_nonzeros(), 0)
@@ -501,60 +696,152 @@
                 << "you found a bug in Ceres. Please report it.";
 
   vector<ProductTerm> product;
-  const vector<int>& row_blocks = m.row_blocks();
-  int row_block_begin = 0;
-  // Iterate over row blocks
-  for (int row_block = 0; row_block < row_blocks.size(); ++row_block) {
-    const int row_block_end = row_block_begin + row_blocks[row_block];
-    // Compute the outer product terms for just one row per row block.
-    const int r = row_block_begin;
-    // Compute the lower triangular part of the product.
-    for (int idx1 = m.rows()[r]; idx1 < m.rows()[r + 1]; ++idx1) {
-      for (int idx2 = m.rows()[r]; idx2 <= idx1; ++idx2) {
-        product.push_back(ProductTerm(m.cols()[idx1],
-                                      m.cols()[idx2],
-                                      product.size()));
+  const vector<int>& col_blocks = m.col_blocks();
+  const vector<int>& crsb_rows = m.crsb_rows();
+  const vector<int>& crsb_cols = m.crsb_cols();
+
+  // Given the input matrix m in Compressed Row Sparse Block (crsb) format
+  //     (row_block, col_block)
+  // represent each block multiplication
+  //     (row_block, col_block1)' X (row_block, col_block2)
+  // by its product term index, and sort the product terms as
+  //     (col_block1, col_block2, index)
+  //
+  // Due to the compression of the rows, col_block is accessed through an
+  // index idx into crsb_cols, i.e. as crsb_cols[idx] in the code below.
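+  //
+  // For example (illustrative): if a row block contains col blocks {0, 2},
+  // the lower triangular (stype > 0) product terms are
+  //     (0, 0), (2, 0), (2, 2)
+  // and the upper triangular (stype <= 0) terms are
+  //     (0, 0), (0, 2), (2, 2).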
+  for (int row_block = 1; row_block < crsb_rows.size(); ++row_block) {
+    for (int idx1 = crsb_rows[row_block - 1]; idx1 < crsb_rows[row_block];
+         ++idx1) {
+      if (stype > 0) {  // Lower triangular matrix.
+        for (int idx2 = crsb_rows[row_block - 1]; idx2 <= idx1; ++idx2) {
+          product.push_back(ProductTerm(crsb_cols[idx1], crsb_cols[idx2],
+                                        product.size()));
+        }
+      } else {  // Upper triangular matrix.
+        for (int idx2 = idx1; idx2 < crsb_rows[row_block]; ++idx2) {
+          product.push_back(ProductTerm(crsb_cols[idx1], crsb_cols[idx2],
+                                        product.size()));
+        }
       }
     }
-    row_block_begin = row_block_end;
   }
-  CHECK_EQ(row_block_begin, m.num_rows());
+
   sort(product.begin(), product.end());
-  return CompressAndFillProgram(m.num_cols(), m.num_cols(), product, program);
+  return CompressAndFillProgram(m.num_cols(), col_blocks, product, program);
 }
 
+// Given the input matrix m in Compressed Row Sparse Block (crsb) format
+//     (row_block, col_block)
+// compute the outer product m' * m as a sum of block multiplications
+//     (row_block, col_block1)' X (row_block, col_block2)
+//
+// For a given row_block of the input matrix m, let m_row_begin denote
+// the starting row of the row block and m_row_nnz the number of
+// nonzeros in each row of the row block. Then the rows belonging to
+// the row block can be viewed as a dense matrix starting at
+//     m.values() + m.rows()[m_row_begin]
+// with dimension
+//     <m.row_blocks()[row_block], m_row_nnz>
+//
+// Each input matrix block (row_block, col_block) can then be viewed as
+// a block of the above dense matrix starting at position
+//     (0, m_col_nnz)
+// with size
+//     <m.row_blocks()[row_block], m.col_blocks()[col_block]>
+// where m_col_nnz is the number of nonzeros before col_block in each row.
+//
+// The outer product block is represented similarly, with m_row_begin,
+// m_row_nnz, m_col_nnz, etc. replaced by row_begin, row_nnz, col_nnz, etc.
+// The difference is that m_row_begin and m_col_nnz are accumulated while
+// traversing the block multiplications, whereas row_begin and col_nnz are
+// obtained from the precomputed block_offsets and program.
+//
+// Due to the compression of the rows, col_block is accessed through an
+// index idx into the crsb_cols vector, i.e. as crsb_cols[idx] in the
+// code below.
+//
+// Note that this function produces a triangular matrix in block units
+// (i.e. diagonal blocks are full blocks) instead of a standard triangular
+// matrix, so there is no special handling of diagonal blocks.
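+//
+// For example (illustrative): if m.row_blocks() = {2} and
+// m.col_blocks() = {3, 4}, with the single row block containing both col
+// blocks, then the row block is a dense 2 x 7 matrix starting at
+// m.values() + m.rows()[0], and block (0, 1) is its 2 x 4 sub-block
+// starting at column m_col_nnz = 3.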
 void CompressedRowSparseMatrix::ComputeOuterProduct(
     const CompressedRowSparseMatrix& m,
+    const int stype,
     const vector<int>& program,
     CompressedRowSparseMatrix* result) {
   result->SetZero();
   double* values = result->mutable_values();
-  const vector<int>& row_blocks = m.row_blocks();
+  const int* rows = result->rows();
+  const vector<int>& block_offsets = result->block_offsets();
 
   int cursor = 0;
-  int row_block_begin = 0;
   const double* m_values = m.values();
   const int* m_rows = m.rows();
-  // Iterate over row blocks.
-  for (int row_block = 0; row_block < row_blocks.size(); ++row_block) {
-    const int row_block_end = row_block_begin + row_blocks[row_block];
-    const int saved_cursor = cursor;
-    for (int r = row_block_begin; r < row_block_end; ++r) {
-      // Reuse the program segment for each row in this row block.
-      cursor = saved_cursor;
-      const int row_begin = m_rows[r];
-      const int row_end = m_rows[r + 1];
-      for (int idx1 = row_begin; idx1 < row_end; ++idx1) {
-        const double v1 =  m_values[idx1];
-        for (int idx2 = row_begin; idx2 <= idx1; ++idx2, ++cursor) {
-          values[program[cursor]] += v1 * m_values[idx2];
+  const vector<int>& row_blocks = m.row_blocks();
+  const vector<int>& col_blocks = m.col_blocks();
+  const vector<int>& crsb_rows = m.crsb_rows();
+  const vector<int>& crsb_cols = m.crsb_cols();
+
+#define COL_BLOCK1 (crsb_cols[idx1])
+#define COL_BLOCK2 (crsb_cols[idx2])
+
+  // Iterate over the row blocks.
+  for (int row_block = 0, m_row_begin = 0; row_block < row_blocks.size();
+       m_row_begin += row_blocks[row_block++]) {
+    // Nonzeros are not stored consecutively across the rows of a block.
+    // The stride between consecutive rows is the number of nonzeros in
+    // the corresponding compressed row of the input matrix.
+    const int m_row_nnz =
+        m_rows[m_row_begin + 1] - m_rows[m_row_begin];
+
+    // Iterate (col_block1 x col_block2).
+    for (int idx1 = crsb_rows[row_block], m_col_nnz1 = 0;
+         idx1 < crsb_rows[row_block + 1];
+         m_col_nnz1 += col_blocks[COL_BLOCK1], ++idx1) {
+      // Nonzeros are not stored consecutively across the rows of a block.
+      // The stride between consecutive rows is the number of nonzeros in
+      // the corresponding compressed row of the outer product matrix.
+      const int row_begin = block_offsets[COL_BLOCK1];
+      const int row_nnz = rows[row_begin + 1] - rows[row_begin];
+
+      if (stype > 0) {  // Lower triangular matrix.
+        for (int idx2 = crsb_rows[row_block], m_col_nnz2 = 0;
+             idx2 <= idx1;
+             m_col_nnz2 += col_blocks[COL_BLOCK2], ++idx2, ++cursor) {
+          int col_nnz = program[cursor];
+          ComputeBlockMultiplication(row_blocks[row_block],
+                                     col_blocks[COL_BLOCK1],
+                                     col_blocks[COL_BLOCK2],
+                                     m_col_nnz1,
+                                     m_col_nnz2,
+                                     m_row_nnz,
+                                     m_values + m_rows[m_row_begin],
+                                     row_nnz,
+                                     values + rows[row_begin] + col_nnz);
+        }
+      } else {  // Upper triangular matrix.
+        for (int idx2 = idx1, m_col_nnz2 = m_col_nnz1;
+             idx2 < crsb_rows[row_block + 1];
+             m_col_nnz2 += col_blocks[COL_BLOCK2], ++idx2, ++cursor) {
+          int col_nnz = program[cursor];
+          ComputeBlockMultiplication(row_blocks[row_block],
+                                     col_blocks[COL_BLOCK1],
+                                     col_blocks[COL_BLOCK2],
+                                     m_col_nnz1,
+                                     m_col_nnz2,
+                                     m_row_nnz,
+                                     m_values + m_rows[m_row_begin],
+                                     row_nnz,
+                                     values + rows[row_begin] + col_nnz);
         }
       }
     }
-    row_block_begin = row_block_end;
   }
 
-  CHECK_EQ(row_block_begin, m.num_rows());
+#undef COL_BLOCK1
+#undef COL_BLOCK2
+
   CHECK_EQ(cursor, program.size());
 }
 
diff --git a/internal/ceres/compressed_row_sparse_matrix.h b/internal/ceres/compressed_row_sparse_matrix.h
index 987339d..7b0d677 100644
--- a/internal/ceres/compressed_row_sparse_matrix.h
+++ b/internal/ceres/compressed_row_sparse_matrix.h
@@ -115,6 +115,15 @@
   const std::vector<int>& col_blocks() const { return col_blocks_; }
   std::vector<int>* mutable_col_blocks() { return &col_blocks_; }
 
+  const std::vector<int>& block_offsets() const { return block_offsets_; }
+  std::vector<int>* mutable_block_offsets() { return &block_offsets_; }
+
+  const std::vector<int>& crsb_rows() const { return crsb_rows_; }
+  std::vector<int>* mutable_crsb_rows() { return &crsb_rows_; }
+
+  const std::vector<int>& crsb_cols() const { return crsb_cols_; }
+  std::vector<int>* mutable_crsb_cols() { return &crsb_cols_; }
+
   // Destructive array resizing method.
   void SetMaxNumNonZeros(int num_nonzeros);
 
@@ -135,8 +144,8 @@
   // and create a CompressedRowSparseMatrix corresponding to it.
   //
   // Also compute a "program" vector, which for every term in the
-  // outer product points to the entry in the values array of the
-  // result matrix where it should be accumulated.
+  // block outer product provides the information needed to locate the
+  // entry in the values array of the result matrix where it should be
+  // accumulated.
   //
   // This program is used by the ComputeOuterProduct function below to
   // compute the outer product.
@@ -145,8 +154,13 @@
   // same sparsity structure, the program only stores the result for
   // one row per row block. The ComputeOuterProduct function reuses
   // this information for each row in the row block.
+  //
+  // stype controls whether the result matrix is upper or lower triangular:
+  //    stype = 1:  lower triangular matrix
+  //    stype = -1: upper triangular matrix
   static CompressedRowSparseMatrix* CreateOuterProductMatrixAndProgram(
       const CompressedRowSparseMatrix& m,
+      const int stype,
       std::vector<int>* program);
 
   // Compute the values array for the expression m.transpose() * m,
@@ -154,6 +168,7 @@
   // created using the CreateOuterProductMatrixAndProgram function
   // above.
   static void ComputeOuterProduct(const CompressedRowSparseMatrix& m,
+                                  const int stype,
                                   const std::vector<int>& program,
                                   CompressedRowSparseMatrix* result);
 
@@ -172,6 +187,18 @@
   std::vector<int> row_blocks_;
   std::vector<int> col_blocks_;
 
+  // For the outer product matrix (J' * J), we pre-compute its block
+  // offsets here for fast outer product computation in block units.
+  // Since the outer product matrix is symmetric, we do not need to
+  // distinguish between row and col blocks. In other words, this is
+  // the prefix sum of row_blocks_/col_blocks_.
+  std::vector<int> block_offsets_;
+
+  // If the matrix has an underlying block structure, then it can also
+  // carry with it compressed row sparse block information.
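+  // For example (illustrative): if row block 0 contains col blocks
+  // {0, 2} and row block 1 contains col block {1}, then
+  // crsb_rows_ = {0, 2, 3} and crsb_cols_ = {0, 2, 1}.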
+  std::vector<int> crsb_rows_;
+  std::vector<int> crsb_cols_;
+
   CERES_DISALLOW_COPY_AND_ASSIGN(CompressedRowSparseMatrix);
 };
 
diff --git a/internal/ceres/compressed_row_sparse_matrix_test.cc b/internal/ceres/compressed_row_sparse_matrix_test.cc
index 385d372..18d95e6 100644
--- a/internal/ceres/compressed_row_sparse_matrix_test.cc
+++ b/internal/ceres/compressed_row_sparse_matrix_test.cc
@@ -89,6 +89,13 @@
     vector<int>* col_blocks = crsm->mutable_col_blocks();
     col_blocks->resize(num_cols);
     std::fill(col_blocks->begin(), col_blocks->end(), 1);
+
+    // With all blocks of size 1, crsb_rows and crsb_cols are equivalent to
+    // rows and cols.
+    std::copy(crsm->rows(), crsm->rows() + crsm->num_rows() + 1,
+              std::back_inserter(*crsm->mutable_crsb_rows()));
+    std::copy(crsm->cols(), crsm->cols() + crsm->num_nonzeros(),
+              std::back_inserter(*crsm->mutable_crsb_cols()));
   }
 
   int num_rows;
@@ -142,6 +149,9 @@
   // Clear the row and column blocks as these are purely scalar tests.
   crsm->mutable_row_blocks()->clear();
   crsm->mutable_col_blocks()->clear();
+  crsm->mutable_crsb_rows()->clear();
+  crsm->mutable_crsb_cols()->clear();
+
   for (int i = 0; i < num_rows; ++i) {
     tsm->Resize(num_rows - i, num_cols);
     crsm->DeleteRows(crsm->num_rows() - tsm->num_rows());
@@ -153,6 +163,8 @@
   // Clear the row and column blocks as these are purely scalar tests.
   crsm->mutable_row_blocks()->clear();
   crsm->mutable_col_blocks()->clear();
+  crsm->mutable_crsb_rows()->clear();
+  crsm->mutable_crsb_cols()->clear();
 
   for (int i = 0; i < num_rows; ++i) {
     TripletSparseMatrix tsm_appendage(*tsm);
@@ -182,6 +194,9 @@
   const vector<int> pre_row_blocks = crsm->row_blocks();
   const vector<int> pre_col_blocks = crsm->col_blocks();
 
+  const vector<int> pre_crsb_rows = crsm->crsb_rows();
+  const vector<int> pre_crsb_cols = crsm->crsb_cols();
+
   scoped_ptr<CompressedRowSparseMatrix> appendage(
       CompressedRowSparseMatrix::CreateBlockDiagonalMatrix(
           diagonal.get(), row_and_column_blocks));
@@ -202,9 +217,23 @@
   EXPECT_EQ(expected_row_blocks, crsm->row_blocks());
   EXPECT_EQ(expected_col_blocks, crsm->col_blocks());
 
+  EXPECT_EQ(crsm->crsb_cols().size(),
+             pre_crsb_cols.size() + row_and_column_blocks.size());
+  EXPECT_EQ(crsm->crsb_rows().size(),
+             pre_crsb_rows.size() + row_and_column_blocks.size());
+  for (int i = 0; i < row_and_column_blocks.size(); ++i) {
+    EXPECT_EQ(crsm->crsb_rows()[i + pre_crsb_rows.size()],
+              pre_crsb_rows.back() + i + 1);
+    EXPECT_EQ(crsm->crsb_cols()[i + pre_crsb_cols.size()], i);
+  }
+
   crsm->DeleteRows(num_diagonal_rows);
   EXPECT_EQ(crsm->row_blocks(), pre_row_blocks);
   EXPECT_EQ(crsm->col_blocks(), pre_col_blocks);
+
+  EXPECT_EQ(crsm->crsb_rows(), pre_crsb_rows);
+  EXPECT_EQ(crsm->crsb_cols(), pre_crsb_cols);
+
 }
 
 TEST_F(CompressedRowSparseMatrixTest, ToDenseMatrix) {
@@ -357,6 +386,14 @@
   matrix.mutable_col_blocks()->push_back(4);
   matrix.mutable_col_blocks()->push_back(2);
 
+  matrix.mutable_crsb_rows()->push_back(0);
+  matrix.mutable_crsb_rows()->push_back(2);
+  matrix.mutable_crsb_rows()->push_back(4);
+  matrix.mutable_crsb_cols()->push_back(0);
+  matrix.mutable_crsb_cols()->push_back(1);
+  matrix.mutable_crsb_cols()->push_back(0);
+  matrix.mutable_crsb_cols()->push_back(1);
+
   rows[0] = 0;
   cols[0] = 1;
   cols[1] = 3;
@@ -440,10 +477,16 @@
   vector<int> cols;
   vector<double> values;
 
+  vector<int> crsb_rows;
+  vector<int> crsb_cols;
+
   while (values.size() == 0) {
     int row_block_begin = 0;
+    crsb_rows.clear();
+    crsb_cols.clear();
     for (int r = 0; r < options.num_row_blocks; ++r) {
       int col_block_begin = 0;
+      crsb_rows.push_back(crsb_cols.size());
       for (int c = 0; c < options.num_col_blocks; ++c) {
         if (RandDouble() <= options.block_density) {
           for (int i = 0; i < row_blocks[r]; ++i) {
@@ -453,11 +496,13 @@
               values.push_back(RandNormal());
             }
           }
+          crsb_cols.push_back(c);
         }
         col_block_begin += col_blocks[c];
       }
       row_block_begin += row_blocks[r];
     }
+    crsb_rows.push_back(crsb_cols.size());
   }
 
   const int num_rows = std::accumulate(row_blocks.begin(), row_blocks.end(), 0);
@@ -472,6 +517,8 @@
   CompressedRowSparseMatrix* matrix = new CompressedRowSparseMatrix(tsm);
   (*matrix->mutable_row_blocks())  = row_blocks;
   (*matrix->mutable_col_blocks())  = col_blocks;
+  (*matrix->mutable_crsb_rows())  = crsb_rows;
+  (*matrix->mutable_crsb_cols())  = crsb_cols;
   return matrix;
 }
 
@@ -534,11 +581,14 @@
         cs_di* expected_outer_product =
             cxsparse.MatrixMatrixMultiply(&cs_matrix_transpose, cs_matrix);
 
+        // Use a compressed row lower triangular matrix for CXSparse.
+        const int stype = 1;
         vector<int> program;
         scoped_ptr<CompressedRowSparseMatrix> outer_product(
             CompressedRowSparseMatrix::CreateOuterProductMatrixAndProgram(
-                *matrix, &program));
+                *matrix, stype, &program));
         CompressedRowSparseMatrix::ComputeOuterProduct(*matrix,
+                                                       stype,
                                                        program,
                                                        outer_product.get());
 
@@ -556,6 +606,7 @@
         expected_matrix.triangularView<Eigen::StrictlyLower>().setZero();
 
         ToDenseMatrix(&actual_outer_product, &actual_matrix);
+        actual_matrix.triangularView<Eigen::StrictlyLower>().setZero();
         const double diff_norm =
             (actual_matrix - expected_matrix).norm() / expected_matrix.norm();
         ASSERT_NEAR(diff_norm, 0.0, kTolerance)
diff --git a/internal/ceres/sparse_normal_cholesky_solver.cc b/internal/ceres/sparse_normal_cholesky_solver.cc
index a4c2c76..8401202 100644
--- a/internal/ceres/sparse_normal_cholesky_solver.cc
+++ b/internal/ceres/sparse_normal_cholesky_solver.cc
@@ -275,14 +275,21 @@
                                &event_logger);
   }
 
+  // Compute the outer product as a compressed row lower triangular matrix.
+  // Eigen's SimplicialLDLT uses the lower triangular part of the matrix
+  // by default. This can be changed to the upper triangular part by
+  // specifying Eigen::SimplicialLDLT<_MatrixType, _UpLo, _Ordering>
+  // with _UpLo = Upper.
+  const int stype = 1;
+
   if (outer_product_.get() == NULL) {
     outer_product_.reset(
         CompressedRowSparseMatrix::CreateOuterProductMatrixAndProgram(
-            *A, &pattern_));
+            *A, stype, &pattern_));
   }
 
   CompressedRowSparseMatrix::ComputeOuterProduct(
-      *A, pattern_, outer_product_.get());
+      *A, stype, pattern_, outer_product_.get());
 
   // Map to an upper triangular column major matrix.
   //
@@ -362,20 +369,21 @@
   summary.termination_type = LINEAR_SOLVER_SUCCESS;
   summary.message = "Success.";
 
+  // Compute the outer product as a compressed row lower triangular matrix.
+  // CXSparse's Cholesky factorization uses the lower triangular part of
+  // the matrix.
+  const int stype = 1;
+
   // Compute the normal equations. J'J delta = J'f and solve them
-  // using a sparse Cholesky factorization. Notice that when compared
-  // to SuiteSparse we have to explicitly compute the normal equations
-  // before they can be factorized. CHOLMOD/SuiteSparse on the other
-  // hand can just work off of Jt to compute the Cholesky
-  // factorization of the normal equations.
+  // using a sparse Cholesky factorization. Notice that we explicitly
+  // compute the normal equations before factorizing them.
   if (outer_product_.get() == NULL) {
     outer_product_.reset(
         CompressedRowSparseMatrix::CreateOuterProductMatrixAndProgram(
-            *A, &pattern_));
+            *A, stype, &pattern_));
   }
 
   CompressedRowSparseMatrix::ComputeOuterProduct(
-      *A, pattern_, outer_product_.get());
+      *A, stype, pattern_, outer_product_.get());
   cs_di lhs =
       cxsparse_.CreateSparseMatrixTransposeView(outer_product_.get());
 
@@ -431,8 +439,26 @@
   summary.num_iterations = 1;
   summary.message = "Success.";
 
+  // Compute the outer product as a compressed row upper triangular matrix.
+  // This is the fastest option for our default natural ordering
+  // (see the comment in cholmod_factorize.c:205 in SuiteSparse).
+  const int stype = -1;
+
+  // Compute the normal equations J'J delta = J'f and solve them
+  // using a sparse Cholesky factorization. Notice that we explicitly
+  // compute the normal equations before factorizing them.
+  if (outer_product_.get() == NULL) {
+    outer_product_.reset(
+        CompressedRowSparseMatrix::CreateOuterProductMatrixAndProgram(
+            *A, stype, &pattern_));
+  }
+
+  CompressedRowSparseMatrix::ComputeOuterProduct(
+      *A, stype, pattern_, outer_product_.get());
+
   const int num_cols = A->num_cols();
-  cholmod_sparse lhs = ss_.CreateSparseMatrixTransposeView(A);
+  cholmod_sparse lhs =
+      ss_.CreateSparseMatrixTransposeView(outer_product_.get(), stype);
   event_logger.AddEvent("Setup");
 
   if (options_.dynamic_sparsity) {
@@ -443,7 +469,7 @@
     if (options_.use_postordering) {
       factor_ = ss_.BlockAnalyzeCholesky(&lhs,
                                          A->col_blocks(),
-                                         A->row_blocks(),
+                                         A->col_blocks(),
                                          &summary.message);
     } else {
       if (options_.dynamic_sparsity) {
diff --git a/internal/ceres/suitesparse.cc b/internal/ceres/suitesparse.cc
index 2018877..b136ce2 100644
--- a/internal/ceres/suitesparse.cc
+++ b/internal/ceres/suitesparse.cc
@@ -96,7 +96,7 @@
 }
 
 cholmod_sparse SuiteSparse::CreateSparseMatrixTransposeView(
-    CompressedRowSparseMatrix* A) {
+    CompressedRowSparseMatrix* A, const int stype) {
   cholmod_sparse m;
   m.nrow = A->num_cols();
   m.ncol = A->num_rows();
@@ -106,7 +106,7 @@
   m.i = reinterpret_cast<void*>(A->mutable_cols());
   m.x = reinterpret_cast<void*>(A->mutable_values());
   m.z = NULL;
-  m.stype = 0;  // Matrix is not symmetric.
+  m.stype = stype;
   m.itype = CHOLMOD_INT;
   m.xtype = CHOLMOD_REAL;
   m.dtype = CHOLMOD_DOUBLE;
diff --git a/internal/ceres/suitesparse.h b/internal/ceres/suitesparse.h
index 380d76e..380c16b 100644
--- a/internal/ceres/suitesparse.h
+++ b/internal/ceres/suitesparse.h
@@ -96,8 +96,11 @@
 
   // Create a cholmod_sparse wrapper around the contents of A. This is
   // a shallow object, which refers to the contents of A and does not
-  // use the SuiteSparse machinery to allocate memory.
-  cholmod_sparse CreateSparseMatrixTransposeView(CompressedRowSparseMatrix* A);
+  // use the SuiteSparse machinery to allocate memory. The stype
+  // argument creates a cholmod_sparse structure with the given stype
+  // (upper triangular: stype = -1, lower triangular: stype = 1).
+  cholmod_sparse CreateSparseMatrixTransposeView(CompressedRowSparseMatrix* A,
+                                                 const int stype);
 
   // Given a vector x, build a cholmod_dense vector of size out_size
   // with the first in_size entries copied from x. If x is NULL, then
diff --git a/internal/ceres/unsymmetric_linear_solver_test.cc b/internal/ceres/unsymmetric_linear_solver_test.cc
index 640009d..95797c5 100644
--- a/internal/ceres/unsymmetric_linear_solver_test.cc
+++ b/internal/ceres/unsymmetric_linear_solver_test.cc
@@ -78,6 +78,15 @@
       for (int i = 0; i < A_->num_cols(); ++i) {
         crsm->mutable_col_blocks()->push_back(1);
       }
+
+      // With all blocks of size 1, crsb_rows and crsb_cols are equivalent to
+      // rows and cols.
+      std::copy(crsm->rows(), crsm->rows() + crsm->num_rows() + 1,
+                std::back_inserter(*crsm->mutable_crsb_rows()));
+
+      std::copy(crsm->cols(), crsm->cols() + crsm->num_nonzeros(),
+                std::back_inserter(*crsm->mutable_crsb_cols()));
+
       transformed_A.reset(crsm);
     } else {
       LOG(FATAL) << "Unknown linear solver : " << options.type;