Optimize the BlockCRSJacobiPreconditioner
Instead of iterating row-wise, use the row block structure
to update the preconditioner one row block at a time.
While benchmarking, I also found an opportunity to speed up
BlockSparseJacobiPreconditiner.
Before
-----------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditioner 132122392 ns 132103000 ns 5
BM_BlockCRSJacobiPreconditionerBA 73365217 ns 73335800 ns 10
BM_BlockSparseJacobiPreconditionerUnstructured 92407762 ns 92407714 ns 7
BM_BlockCRSJacobiPreconditionerUnstructured 72256367 ns 72256400 ns 10
After
-----------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditionerBA 39723456 ns 39722222 ns 18
BM_BlockCRSJacobiPreconditionerBA 46561625 ns 46561667 ns 15
BM_BlockSparseJacobiPreconditionerUnstructured 52676208 ns 52676167 ns 12
BM_BlockCRSJacobiPreconditionerUnstructured 54430564 ns 54421462 ns 13
Change-Id: I6da8e044b26f0c24481deee5722322a606e9e5ca
diff --git a/internal/ceres/block_jacobi_preconditioner.cc b/internal/ceres/block_jacobi_preconditioner.cc
index 779693c..7aaa605 100644
--- a/internal/ceres/block_jacobi_preconditioner.cc
+++ b/internal/ceres/block_jacobi_preconditioner.cc
@@ -36,6 +36,7 @@
#include "ceres/block_structure.h"
#include "ceres/casts.h"
#include "ceres/internal/eigen.h"
+#include "ceres/small_blas.h"
namespace ceres::internal {
@@ -69,7 +70,13 @@
m_->GetCell(block_id, block_id, &r, &c, &row_stride, &col_stride);
MatrixRef m(cell_info->values, row_stride, col_stride);
ConstMatrixRef b(values + cell.position, row_block_size, col_block_size);
- m.block(r, c, col_block_size, col_block_size) += b.transpose() * b;
+ // clang-format off
+ MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::Dynamic,Eigen::Dynamic, 1>(
+ values + cell.position, row_block_size,col_block_size,
+ values + cell.position, row_block_size,col_block_size,
+ cell_info->values,r, c,row_stride,col_stride);
+ // clang-format on
}
}
@@ -134,7 +141,9 @@
bool BlockCRSJacobiPreconditioner::UpdateImpl(
const CompressedRowSparseMatrix& A, const double* D) {
const auto& col_blocks = A.col_blocks();
+ const auto& row_blocks = A.row_blocks();
const int num_col_blocks = col_blocks.size();
+ const int num_row_blocks = row_blocks.size();
const int* a_rows = A.rows();
const int* a_cols = A.cols();
@@ -145,20 +154,30 @@
const int* m_rows = m_->rows();
const int num_rows = A.num_rows();
- // The following loop can likely be optimized by exploiting the fact that each
- // row block has exactly the same sparsity structure.
- for (int r = 0; r < num_rows; ++r) {
+ int r = 0;
+ for (int i = 0; i < num_row_blocks; ++i) {
+ const int row_block_size = row_blocks[i];
+ const int row_nnz = a_rows[r + 1] - a_rows[r];
+ ConstMatrixRef row_block(a_values + a_rows[r], row_block_size, row_nnz);
int idx = a_rows[r];
- while (idx < a_rows[r + 1]) {
+ int c = 0;
+ while (c < row_nnz) {
+ const int idx = a_rows[r] + c;
const int col = a_cols[idx];
const int col_block_size = m_rows[col + 1] - m_rows[col];
+
// We make use of the fact that the entire diagonal block is stored
// contiguously in memory as a row-major matrix.
MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
- ConstVectorRef b(a_values + idx, col_block_size);
- m.selfadjointView<Eigen::Upper>().rankUpdate(b);
- idx += col_block_size;
+
+ // We do not have a row_stride version of MatrixTransposeMatrixMultiply,
+ // otherwise we could use it here to further speed up the following
+ // expression.
+ auto b = row_block.middleCols(c, col_block_size);
+ m.noalias() += b.transpose() * b;
+ c += col_block_size;
}
+ r += row_block_size;
}
for (int i = 0, col = 0; i < num_col_blocks; ++i) {
@@ -170,8 +189,7 @@
ConstVectorRef(D + col, col_block_size).array().square().matrix();
}
- m = m.selfadjointView<Eigen::Upper>().llt().solve(
- Matrix::Identity(col_block_size, col_block_size));
+ m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size));
col += col_block_size;
}