Optimize the BlockCRSJacobiPreconditioner Instead of iterating row-wise, use the row block structure to update the preconditioner one row block at a time. While benchmarking, I also found an opportunity to speed up BlockSparseJacobiPreconditiner. Before ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------- BM_BlockSparseJacobiPreconditioner 132122392 ns 132103000 ns 5 BM_BlockCRSJacobiPreconditionerBA 73365217 ns 73335800 ns 10 BM_BlockSparseJacobiPreconditionerUnstructured 92407762 ns 92407714 ns 7 BM_BlockCRSJacobiPreconditionerUnstructured 72256367 ns 72256400 ns 10 After ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------- BM_BlockSparseJacobiPreconditionerBA 39723456 ns 39722222 ns 18 BM_BlockCRSJacobiPreconditionerBA 46561625 ns 46561667 ns 15 BM_BlockSparseJacobiPreconditionerUnstructured 52676208 ns 52676167 ns 12 BM_BlockCRSJacobiPreconditionerUnstructured 54430564 ns 54421462 ns 13 Change-Id: I6da8e044b26f0c24481deee5722322a606e9e5ca

commit: de0f74e40a49706867287431b4466c1e7a90fbd6 [log] [tgz]
author: Sameer Agarwal <sameeragarwal@google.com> Thu Aug 25 16:02:14 2022 +0530
committer: Sameer Agarwal <sameeragarwal@google.com> Sun Aug 28 17:24:28 2022 -0700
tree: b6f2a61436a50f117f0e0faee9127fce86e4e662
parent: ba65ddd311239c26dd8950ab37f504d37bf6fa44 [diff]
diff --git a/internal/ceres/block_jacobi_preconditioner.cc b/internal/ceres/block_jacobi_preconditioner.cc
index 779693c..7aaa605 100644
--- a/internal/ceres/block_jacobi_preconditioner.cc
+++ b/internal/ceres/block_jacobi_preconditioner.cc

@@ -36,6 +36,7 @@
 #include "ceres/block_structure.h"
 #include "ceres/casts.h"
 #include "ceres/internal/eigen.h"
+#include "ceres/small_blas.h"
 
 namespace ceres::internal {
 
@@ -69,7 +70,13 @@
           m_->GetCell(block_id, block_id, &r, &c, &row_stride, &col_stride);
       MatrixRef m(cell_info->values, row_stride, col_stride);
       ConstMatrixRef b(values + cell.position, row_block_size, col_block_size);
-      m.block(r, c, col_block_size, col_block_size) += b.transpose() * b;
+      // clang-format off
+      MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic,
+          Eigen::Dynamic,Eigen::Dynamic, 1>(
+          values + cell.position, row_block_size,col_block_size,
+          values + cell.position, row_block_size,col_block_size,
+          cell_info->values,r, c,row_stride,col_stride);
+      // clang-format on
     }
   }
 
@@ -134,7 +141,9 @@
 bool BlockCRSJacobiPreconditioner::UpdateImpl(
     const CompressedRowSparseMatrix& A, const double* D) {
   const auto& col_blocks = A.col_blocks();
+  const auto& row_blocks = A.row_blocks();
   const int num_col_blocks = col_blocks.size();
+  const int num_row_blocks = row_blocks.size();
 
   const int* a_rows = A.rows();
   const int* a_cols = A.cols();
@@ -145,20 +154,30 @@
   const int* m_rows = m_->rows();
 
   const int num_rows = A.num_rows();
-  // The following loop can likely be optimized by exploiting the fact that each
-  // row block has exactly the same sparsity structure.
-  for (int r = 0; r < num_rows; ++r) {
+  int r = 0;
+  for (int i = 0; i < num_row_blocks; ++i) {
+    const int row_block_size = row_blocks[i];
+    const int row_nnz = a_rows[r + 1] - a_rows[r];
+    ConstMatrixRef row_block(a_values + a_rows[r], row_block_size, row_nnz);
     int idx = a_rows[r];
-    while (idx < a_rows[r + 1]) {
+    int c = 0;
+    while (c < row_nnz) {
+      const int idx = a_rows[r] + c;
       const int col = a_cols[idx];
       const int col_block_size = m_rows[col + 1] - m_rows[col];
+
       // We make use of the fact that the entire diagonal block is stored
       // contiguously in memory as a row-major matrix.
       MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
-      ConstVectorRef b(a_values + idx, col_block_size);
-      m.selfadjointView<Eigen::Upper>().rankUpdate(b);
-      idx += col_block_size;
+
+      // We do not have a row_stride version of MatrixTransposeMatrixMultiply,
+      // otherwise we could use it here to further speed up the following
+      // expression.
+      auto b = row_block.middleCols(c, col_block_size);
+      m.noalias() += b.transpose() * b;
+      c += col_block_size;
     }
+    r += row_block_size;
   }
 
   for (int i = 0, col = 0; i < num_col_blocks; ++i) {
@@ -170,8 +189,7 @@
           ConstVectorRef(D + col, col_block_size).array().square().matrix();
     }
 
-    m = m.selfadjointView<Eigen::Upper>().llt().solve(
-        Matrix::Identity(col_block_size, col_block_size));
+    m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size));
     col += col_block_size;
   }
commit	de0f74e40a49706867287431b4466c1e7a90fbd6	[log] [tgz]
author	Sameer Agarwal <sameeragarwal@google.com>	Thu Aug 25 16:02:14 2022 +0530
committer	Sameer Agarwal <sameeragarwal@google.com>	Sun Aug 28 17:24:28 2022 -0700
tree	b6f2a61436a50f117f0e0faee9127fce86e4e662
parent	ba65ddd311239c26dd8950ab37f504d37bf6fa44 [diff]