Optimize  the BlockCRSJacobiPreconditioner

Instead of iterating row-wise, use the row block structure
to update the preconditioner one row block at a time.

While benchmarking, I also found an opportunity to speed up
BlockSparseJacobiPreconditiner.

Before
-----------------------------------------------------------------------------------------
Benchmark                                               Time             CPU   Iterations
-----------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditioner              132122392 ns    132103000 ns            5
BM_BlockCRSJacobiPreconditionerBA                73365217 ns     73335800 ns           10
BM_BlockSparseJacobiPreconditionerUnstructured   92407762 ns     92407714 ns            7
BM_BlockCRSJacobiPreconditionerUnstructured      72256367 ns     72256400 ns           10

After

-----------------------------------------------------------------------------------------
Benchmark                                               Time             CPU   Iterations
-----------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditionerBA             39723456 ns     39722222 ns           18
BM_BlockCRSJacobiPreconditionerBA                46561625 ns     46561667 ns           15
BM_BlockSparseJacobiPreconditionerUnstructured   52676208 ns     52676167 ns           12
BM_BlockCRSJacobiPreconditionerUnstructured      54430564 ns     54421462 ns           13

Change-Id: I6da8e044b26f0c24481deee5722322a606e9e5ca
diff --git a/internal/ceres/block_jacobi_preconditioner.cc b/internal/ceres/block_jacobi_preconditioner.cc
index 779693c..7aaa605 100644
--- a/internal/ceres/block_jacobi_preconditioner.cc
+++ b/internal/ceres/block_jacobi_preconditioner.cc
@@ -36,6 +36,7 @@
 #include "ceres/block_structure.h"
 #include "ceres/casts.h"
 #include "ceres/internal/eigen.h"
+#include "ceres/small_blas.h"
 
 namespace ceres::internal {
 
@@ -69,7 +70,13 @@
           m_->GetCell(block_id, block_id, &r, &c, &row_stride, &col_stride);
       MatrixRef m(cell_info->values, row_stride, col_stride);
       ConstMatrixRef b(values + cell.position, row_block_size, col_block_size);
-      m.block(r, c, col_block_size, col_block_size) += b.transpose() * b;
+      // clang-format off
+      MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic,
+          Eigen::Dynamic,Eigen::Dynamic, 1>(
+          values + cell.position, row_block_size,col_block_size,
+          values + cell.position, row_block_size,col_block_size,
+          cell_info->values,r, c,row_stride,col_stride);
+      // clang-format on
     }
   }
 
@@ -134,7 +141,9 @@
 bool BlockCRSJacobiPreconditioner::UpdateImpl(
     const CompressedRowSparseMatrix& A, const double* D) {
   const auto& col_blocks = A.col_blocks();
+  const auto& row_blocks = A.row_blocks();
   const int num_col_blocks = col_blocks.size();
+  const int num_row_blocks = row_blocks.size();
 
   const int* a_rows = A.rows();
   const int* a_cols = A.cols();
@@ -145,20 +154,30 @@
   const int* m_rows = m_->rows();
 
   const int num_rows = A.num_rows();
-  // The following loop can likely be optimized by exploiting the fact that each
-  // row block has exactly the same sparsity structure.
-  for (int r = 0; r < num_rows; ++r) {
+  int r = 0;
+  for (int i = 0; i < num_row_blocks; ++i) {
+    const int row_block_size = row_blocks[i];
+    const int row_nnz = a_rows[r + 1] - a_rows[r];
+    ConstMatrixRef row_block(a_values + a_rows[r], row_block_size, row_nnz);
     int idx = a_rows[r];
-    while (idx < a_rows[r + 1]) {
+    int c = 0;
+    while (c < row_nnz) {
+      const int idx = a_rows[r] + c;
       const int col = a_cols[idx];
       const int col_block_size = m_rows[col + 1] - m_rows[col];
+
       // We make use of the fact that the entire diagonal block is stored
       // contiguously in memory as a row-major matrix.
       MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
-      ConstVectorRef b(a_values + idx, col_block_size);
-      m.selfadjointView<Eigen::Upper>().rankUpdate(b);
-      idx += col_block_size;
+
+      // We do not have a row_stride version of MatrixTransposeMatrixMultiply,
+      // otherwise we could use it here to further speed up the following
+      // expression.
+      auto b = row_block.middleCols(c, col_block_size);
+      m.noalias() += b.transpose() * b;
+      c += col_block_size;
     }
+    r += row_block_size;
   }
 
   for (int i = 0, col = 0; i < num_col_blocks; ++i) {
@@ -170,8 +189,7 @@
           ConstVectorRef(D + col, col_block_size).array().square().matrix();
     }
 
-    m = m.selfadjointView<Eigen::Upper>().llt().solve(
-        Matrix::Identity(col_block_size, col_block_size));
+    m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size));
     col += col_block_size;
   }