Optimize the BlockCRSJacobiPreconditioner Instead of iterating row-wise, use the row block structure to update the preconditioner one row block at a time. While benchmarking, I also found an opportunity to speed up BlockSparseJacobiPreconditiner. Before ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------- BM_BlockSparseJacobiPreconditioner 132122392 ns 132103000 ns 5 BM_BlockCRSJacobiPreconditionerBA 73365217 ns 73335800 ns 10 BM_BlockSparseJacobiPreconditionerUnstructured 92407762 ns 92407714 ns 7 BM_BlockCRSJacobiPreconditionerUnstructured 72256367 ns 72256400 ns 10 After ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------- BM_BlockSparseJacobiPreconditionerBA 39723456 ns 39722222 ns 18 BM_BlockCRSJacobiPreconditionerBA 46561625 ns 46561667 ns 15 BM_BlockSparseJacobiPreconditionerUnstructured 52676208 ns 52676167 ns 12 BM_BlockCRSJacobiPreconditionerUnstructured 54430564 ns 54421462 ns 13 Change-Id: I6da8e044b26f0c24481deee5722322a606e9e5ca
diff --git a/internal/ceres/block_jacobi_preconditioner.cc b/internal/ceres/block_jacobi_preconditioner.cc index 779693c..7aaa605 100644 --- a/internal/ceres/block_jacobi_preconditioner.cc +++ b/internal/ceres/block_jacobi_preconditioner.cc
@@ -36,6 +36,7 @@ #include "ceres/block_structure.h" #include "ceres/casts.h" #include "ceres/internal/eigen.h" +#include "ceres/small_blas.h" namespace ceres::internal { @@ -69,7 +70,13 @@ m_->GetCell(block_id, block_id, &r, &c, &row_stride, &col_stride); MatrixRef m(cell_info->values, row_stride, col_stride); ConstMatrixRef b(values + cell.position, row_block_size, col_block_size); - m.block(r, c, col_block_size, col_block_size) += b.transpose() * b; + // clang-format off + MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic, + Eigen::Dynamic,Eigen::Dynamic, 1>( + values + cell.position, row_block_size,col_block_size, + values + cell.position, row_block_size,col_block_size, + cell_info->values,r, c,row_stride,col_stride); + // clang-format on } } @@ -134,7 +141,9 @@ bool BlockCRSJacobiPreconditioner::UpdateImpl( const CompressedRowSparseMatrix& A, const double* D) { const auto& col_blocks = A.col_blocks(); + const auto& row_blocks = A.row_blocks(); const int num_col_blocks = col_blocks.size(); + const int num_row_blocks = row_blocks.size(); const int* a_rows = A.rows(); const int* a_cols = A.cols(); @@ -145,20 +154,30 @@ const int* m_rows = m_->rows(); const int num_rows = A.num_rows(); - // The following loop can likely be optimized by exploiting the fact that each - // row block has exactly the same sparsity structure. - for (int r = 0; r < num_rows; ++r) { + int r = 0; + for (int i = 0; i < num_row_blocks; ++i) { + const int row_block_size = row_blocks[i]; + const int row_nnz = a_rows[r + 1] - a_rows[r]; + ConstMatrixRef row_block(a_values + a_rows[r], row_block_size, row_nnz); int idx = a_rows[r]; - while (idx < a_rows[r + 1]) { + int c = 0; + while (c < row_nnz) { + const int idx = a_rows[r] + c; const int col = a_cols[idx]; const int col_block_size = m_rows[col + 1] - m_rows[col]; + // We make use of the fact that the entire diagonal block is stored // contiguously in memory as a row-major matrix. MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size); - ConstVectorRef b(a_values + idx, col_block_size); - m.selfadjointView<Eigen::Upper>().rankUpdate(b); - idx += col_block_size; + + // We do not have a row_stride version of MatrixTransposeMatrixMultiply, + // otherwise we could use it here to further speed up the following + // expression. + auto b = row_block.middleCols(c, col_block_size); + m.noalias() += b.transpose() * b; + c += col_block_size; } + r += row_block_size; } for (int i = 0, col = 0; i < num_col_blocks; ++i) { @@ -170,8 +189,7 @@ ConstVectorRef(D + col, col_block_size).array().square().matrix(); } - m = m.selfadjointView<Eigen::Upper>().llt().solve( - Matrix::Identity(col_block_size, col_block_size)); + m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size)); col += col_block_size; }