Parallelize block_jacobi_preconditioner Use ParallelFor to parallelize both versions of the block Jacobi preconditioner. Also add benchmarks for varying number of threads. Benchmark on M1 Mac Pro Before: ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------------------------------------- BM_BlockSparseJacobiPreconditionerBA 44847927 ns 44788313 ns 16 BM_BlockCRSJacobiPreconditionerBA 48772330 ns 48723571 ns 14 BM_BlockSparseJacobiPreconditionerUnstructured 62385231 ns 62306818 ns 11 BM_BlockCRSJacobiPreconditionerUnstructured 60671473 ns 60577727 ns 11 After: -------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations -------------------------------------------------------------------------------------------- BM_BlockSparseJacobiPreconditionerBA/1 53314862 ns 53302308 ns 13 BM_BlockSparseJacobiPreconditionerBA/2 33601214 ns 33295143 ns 21 BM_BlockSparseJacobiPreconditionerBA/4 28162794 ns 27224167 ns 30 BM_BlockSparseJacobiPreconditionerBA/8 31402448 ns 28038760 ns 25 BM_BlockSparseJacobiPreconditionerBA/16 30820813 ns 22625233 ns 30 BM_BlockCRSJacobiPreconditionerBA/1 60348194 ns 60332167 ns 12 BM_BlockCRSJacobiPreconditionerBA/2 35489954 ns 34782050 ns 20 BM_BlockCRSJacobiPreconditionerBA/4 23636360 ns 22547032 ns 31 BM_BlockCRSJacobiPreconditionerBA/8 31688798 ns 27857800 ns 25 BM_BlockCRSJacobiPreconditionerBA/16 30806695 ns 20562516 ns 31 BM_BlockSparseJacobiPreconditionerUnstructured/1 59793396 ns 59788583 ns 12 BM_BlockSparseJacobiPreconditionerUnstructured/2 35192900 ns 34968900 ns 20 BM_BlockSparseJacobiPreconditionerUnstructured/4 30171145 ns 28924480 ns 25 BM_BlockSparseJacobiPreconditionerUnstructured/8 24982583 ns 23193172 ns 29 BM_BlockSparseJacobiPreconditionerUnstructured/16 23370546 ns 18389694 ns 36 BM_BlockCRSJacobiPreconditionerUnstructured/1 63204538 ns 
63204545 ns 11 BM_BlockCRSJacobiPreconditionerUnstructured/2 34466060 ns 34193429 ns 21 BM_BlockCRSJacobiPreconditionerUnstructured/4 22712230 ns 20491147 ns 34 BM_BlockCRSJacobiPreconditionerUnstructured/8 16701833 ns 16190395 ns 43 BM_BlockCRSJacobiPreconditionerUnstructured/16 16762565 ns 12857304 ns 56 Note that single threaded performance gets worse. Performance goes up for 2 and 4 threads and then essentially stalls. Change-Id: I96a5d2f719545e14c03d73e71c8c0564e8c1c729
diff --git a/internal/ceres/block_jacobi_preconditioner.cc b/internal/ceres/block_jacobi_preconditioner.cc index b452e7d..b7ee002 100644 --- a/internal/ceres/block_jacobi_preconditioner.cc +++ b/internal/ceres/block_jacobi_preconditioner.cc
@@ -39,12 +39,14 @@ #include "ceres/block_structure.h" #include "ceres/casts.h" #include "ceres/internal/eigen.h" +#include "ceres/parallel_for.h" #include "ceres/small_blas.h" namespace ceres::internal { BlockSparseJacobiPreconditioner::BlockSparseJacobiPreconditioner( - const BlockSparseMatrix& A) { + Preconditioner::Options options, const BlockSparseMatrix& A) + : options_(std::move(options)) { m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>( A.block_structure()->cols); } @@ -56,48 +58,63 @@ const CompressedRowBlockStructure* bs = A.block_structure(); const double* values = A.values(); m_->SetZero(); - for (int i = 0; i < bs->rows.size(); ++i) { - const int row_block_size = bs->rows[i].block.size; - const std::vector<Cell>& cells = bs->rows[i].cells; - for (const auto& cell : cells) { - const int block_id = cell.block_id; - const int col_block_size = bs->cols[block_id].size; - int r, c, row_stride, col_stride; - CellInfo* cell_info = - m_->GetCell(block_id, block_id, &r, &c, &row_stride, &col_stride); - MatrixRef m(cell_info->values, row_stride, col_stride); - ConstMatrixRef b(values + cell.position, row_block_size, col_block_size); - // clang-format off - MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic, - Eigen::Dynamic,Eigen::Dynamic, 1>( - values + cell.position, row_block_size,col_block_size, - values + cell.position, row_block_size,col_block_size, - cell_info->values,r, c,row_stride,col_stride); - // clang-format on - } - } + ParallelFor(options_.context, + 0, + bs->rows.size(), + options_.num_threads, + [this, bs, values](int i) { + const int row_block_size = bs->rows[i].block.size; + const std::vector<Cell>& cells = bs->rows[i].cells; + for (const auto& cell : cells) { + const int block_id = cell.block_id; + const int col_block_size = bs->cols[block_id].size; + int r, c, row_stride, col_stride; + CellInfo* cell_info = m_->GetCell( + block_id, block_id, &r, &c, &row_stride, &col_stride); + MatrixRef m(cell_info->values, row_stride, 
col_stride); + ConstMatrixRef b( + values + cell.position, row_block_size, col_block_size); + std::lock_guard<std::mutex> l(cell_info->m); + // clang-format off + MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic, + Eigen::Dynamic,Eigen::Dynamic, 1>( + values + cell.position, row_block_size,col_block_size, + values + cell.position, row_block_size,col_block_size, + cell_info->values,r, c,row_stride,col_stride); + // clang-format on + } + }); if (D != nullptr) { // Add the diagonal. - int position = 0; - for (int i = 0; i < bs->cols.size(); ++i) { - const int block_size = bs->cols[i].size; - int r, c, row_stride, col_stride; - CellInfo* cell_info = m_->GetCell(i, i, &r, &c, &row_stride, &col_stride); - MatrixRef m(cell_info->values, row_stride, col_stride); - m.block(r, c, block_size, block_size).diagonal() += - ConstVectorRef(D + position, block_size).array().square().matrix(); - position += block_size; - } + ParallelFor(options_.context, + 0, + bs->cols.size(), + options_.num_threads, + [this, bs, D](int i) { + const int block_size = bs->cols[i].size; + int r, c, row_stride, col_stride; + CellInfo* cell_info = + m_->GetCell(i, i, &r, &c, &row_stride, &col_stride); + MatrixRef m(cell_info->values, row_stride, col_stride); + m.block(r, c, block_size, block_size).diagonal() += + ConstVectorRef(D + bs->cols[i].position, block_size) + .array() + .square() + .matrix(); + }); } + // TODO(sameeragarwal): Once matrices are threaded, this call to invert should + // also be parallelized. m_->Invert(); return true; } BlockCRSJacobiPreconditioner::BlockCRSJacobiPreconditioner( - const CompressedRowSparseMatrix& A) { + Preconditioner::Options options, const CompressedRowSparseMatrix& A) + : options_(std::move(options)), locks_(A.col_blocks().size()) { auto& col_blocks = A.col_blocks(); // Compute the number of non-zeros in the preconditioner. 
This is needed so @@ -126,6 +143,12 @@ } } + // In reality we only need num_col_blocks locks, however that would require + // that in UpdateImpl we are able to look up the column block from its + // first column. To save ourselves this map we will instead spend a few extra + // lock objects. + std::vector<std::mutex> locks(A.num_cols()); + locks_.swap(locks); CHECK_EQ(m_rows[A.num_cols()], m_nnz); } @@ -141,49 +164,60 @@ const int* a_rows = A.rows(); const int* a_cols = A.cols(); const double* a_values = A.values(); - - m_->SetZero(); double* m_values = m_->mutable_values(); const int* m_rows = m_->rows(); - for (int i = 0; i < num_row_blocks; ++i) { - const int row = row_blocks[i].position; - const int row_block_size = row_blocks[i].size; - const int row_nnz = a_rows[row + 1] - a_rows[row]; - ConstMatrixRef row_block(a_values + a_rows[row], row_block_size, row_nnz); - int c = 0; - while (c < row_nnz) { - const int idx = a_rows[row] + c; - const int col = a_cols[idx]; - const int col_block_size = m_rows[col + 1] - m_rows[col]; + m_->SetZero(); - // We make use of the fact that the entire diagonal block is stored - // contiguously in memory as a row-major matrix. - MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size); + ParallelFor( + options_.context, + 0, + num_row_blocks, + options_.num_threads, + [this, row_blocks, a_rows, a_cols, a_values, m_values, m_rows](int i) { + const int row = row_blocks[i].position; + const int row_block_size = row_blocks[i].size; + const int row_nnz = a_rows[row + 1] - a_rows[row]; + ConstMatrixRef row_block( + a_values + a_rows[row], row_block_size, row_nnz); + int c = 0; + while (c < row_nnz) { + const int idx = a_rows[row] + c; + const int col = a_cols[idx]; + const int col_block_size = m_rows[col + 1] - m_rows[col]; - // We do not have a row_stride version of MatrixTransposeMatrixMultiply, - otherwise we could use it here to further speed up the following - expression. 
- auto b = row_block.middleCols(c, col_block_size); - m.noalias() += b.transpose() * b; - c += col_block_size; - } - } + // We make use of the fact that the entire diagonal block is + // stored contiguously in memory as a row-major matrix. + MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size); + // We do not have a row_stride version of + // MatrixTransposeMatrixMultiply, otherwise we could use it + // here to further speed up the following expression. + auto b = row_block.middleCols(c, col_block_size); + std::lock_guard<std::mutex> l(locks_[col]); + m.noalias() += b.transpose() * b; + c += col_block_size; + } + }); - for (int i = 0; i < num_col_blocks; ++i) { - const int col = col_blocks[i].position; - const int col_block_size = col_blocks[i].size; - MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size); + ParallelFor( + options_.context, + 0, + num_col_blocks, + options_.num_threads, + [col_blocks, m_rows, m_values, D](int i) { + const int col = col_blocks[i].position; + const int col_block_size = col_blocks[i].size; + MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size); - if (D != nullptr) { - m.diagonal() += - ConstVectorRef(D + col, col_block_size).array().square().matrix(); - } + if (D != nullptr) { + m.diagonal() += + ConstVectorRef(D + col, col_block_size).array().square().matrix(); + } - // TODO(sameeragarwal): Deal with Cholesky inversion failure here and - // elsewhere. - m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size)); - } + // TODO(sameeragarwal): Deal with Cholesky inversion failure here and + // elsewhere. + m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size)); + }); return true; }
diff --git a/internal/ceres/block_jacobi_preconditioner.h b/internal/ceres/block_jacobi_preconditioner.h index 265b7d6..a39ad12 100644 --- a/internal/ceres/block_jacobi_preconditioner.h +++ b/internal/ceres/block_jacobi_preconditioner.h
@@ -52,7 +52,8 @@ : public BlockSparseMatrixPreconditioner { public: // A must remain valid while the BlockJacobiPreconditioner is. - explicit BlockSparseJacobiPreconditioner(const BlockSparseMatrix& A); + explicit BlockSparseJacobiPreconditioner(Preconditioner::Options, + const BlockSparseMatrix& A); ~BlockSparseJacobiPreconditioner() override; void RightMultiplyAndAccumulate(const double* x, double* y) const final { return m_->RightMultiplyAndAccumulate(x, y); @@ -64,6 +65,7 @@ private: bool UpdateImpl(const BlockSparseMatrix& A, const double* D) final; + Preconditioner::Options options_; std::unique_ptr<BlockRandomAccessDiagonalMatrix> m_; }; @@ -73,7 +75,8 @@ : public CompressedRowSparseMatrixPreconditioner { public: // A must remain valid while the BlockJacobiPreconditioner is. - explicit BlockCRSJacobiPreconditioner(const CompressedRowSparseMatrix& A); + explicit BlockCRSJacobiPreconditioner(Preconditioner::Options options, + const CompressedRowSparseMatrix& A); ~BlockCRSJacobiPreconditioner() override; void RightMultiplyAndAccumulate(const double* x, double* y) const final { m_->RightMultiplyAndAccumulate(x, y); @@ -85,6 +88,8 @@ private: bool UpdateImpl(const CompressedRowSparseMatrix& A, const double* D) final; + Preconditioner::Options options_; + std::vector<std::mutex> locks_; std::unique_ptr<CompressedRowSparseMatrix> m_; };
diff --git a/internal/ceres/block_jacobi_preconditioner_benchmark.cc b/internal/ceres/block_jacobi_preconditioner_benchmark.cc index 1d8ad10..d2508c4 100644 --- a/internal/ceres/block_jacobi_preconditioner_benchmark.cc +++ b/internal/ceres/block_jacobi_preconditioner_benchmark.cc
@@ -59,14 +59,26 @@ std::mt19937 prng; auto jacobian = CreateFakeBundleAdjustmentJacobian( kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); - BlockSparseJacobiPreconditioner p(*jacobian); + + Preconditioner::Options preconditioner_options; + ContextImpl context; + preconditioner_options.context = &context; + preconditioner_options.num_threads = state.range(0); + context.EnsureMinimumThreads(preconditioner_options.num_threads); + BlockSparseJacobiPreconditioner p(preconditioner_options, *jacobian); + Vector d = Vector::Ones(jacobian->num_cols()); for (auto _ : state) { p.Update(*jacobian, d.data()); } } -BENCHMARK(BM_BlockSparseJacobiPreconditionerBA); +BENCHMARK(BM_BlockSparseJacobiPreconditionerBA) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); static void BM_BlockCRSJacobiPreconditionerBA(benchmark::State& state) { std::mt19937 prng; @@ -76,14 +88,25 @@ CompressedRowSparseMatrix jacobian_crs( jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros()); jacobian->ToCompressedRowSparseMatrix(&jacobian_crs); - BlockCRSJacobiPreconditioner p(jacobian_crs); + Preconditioner::Options preconditioner_options; + ContextImpl context; + preconditioner_options.context = &context; + preconditioner_options.num_threads = state.range(0); + context.EnsureMinimumThreads(preconditioner_options.num_threads); + BlockCRSJacobiPreconditioner p(preconditioner_options, jacobian_crs); + Vector d = Vector::Ones(jacobian_crs.num_cols()); for (auto _ : state) { p.Update(jacobian_crs, d.data()); } } -BENCHMARK(BM_BlockCRSJacobiPreconditionerBA); +BENCHMARK(BM_BlockCRSJacobiPreconditionerBA) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); static void BM_BlockSparseJacobiPreconditionerUnstructured( benchmark::State& state) { @@ -98,14 +121,25 @@ std::mt19937 prng; auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); - BlockSparseJacobiPreconditioner p(*jacobian); + Preconditioner::Options preconditioner_options; + ContextImpl 
context; + preconditioner_options.context = &context; + preconditioner_options.num_threads = state.range(0); + context.EnsureMinimumThreads(preconditioner_options.num_threads); + BlockSparseJacobiPreconditioner p(preconditioner_options, *jacobian); + Vector d = Vector::Ones(jacobian->num_cols()); for (auto _ : state) { p.Update(*jacobian, d.data()); } } -BENCHMARK(BM_BlockSparseJacobiPreconditionerUnstructured); +BENCHMARK(BM_BlockSparseJacobiPreconditionerUnstructured) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); static void BM_BlockCRSJacobiPreconditionerUnstructured( benchmark::State& state) { @@ -123,13 +157,24 @@ CompressedRowSparseMatrix jacobian_crs( jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros()); jacobian->ToCompressedRowSparseMatrix(&jacobian_crs); - BlockCRSJacobiPreconditioner p(jacobian_crs); + Preconditioner::Options preconditioner_options; + ContextImpl context; + preconditioner_options.context = &context; + preconditioner_options.num_threads = state.range(0); + context.EnsureMinimumThreads(preconditioner_options.num_threads); + BlockCRSJacobiPreconditioner p(preconditioner_options, jacobian_crs); + Vector d = Vector::Ones(jacobian_crs.num_cols()); for (auto _ : state) { p.Update(jacobian_crs, d.data()); } } -BENCHMARK(BM_BlockCRSJacobiPreconditionerUnstructured); +BENCHMARK(BM_BlockCRSJacobiPreconditionerUnstructured) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); } // namespace ceres::internal
diff --git a/internal/ceres/block_jacobi_preconditioner_test.cc b/internal/ceres/block_jacobi_preconditioner_test.cc index fa051e4..eb3c9d0 100644 --- a/internal/ceres/block_jacobi_preconditioner_test.cc +++ b/internal/ceres/block_jacobi_preconditioner_test.cc
@@ -55,6 +55,10 @@ options.block_density = 0.25; std::mt19937 prng; + Preconditioner::Options preconditioner_options; + ContextImpl context; + preconditioner_options.context = &context; + for (int trial = 0; trial < kNumtrials; ++trial) { auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); Vector diagonal = Vector::Ones(jacobian->num_cols()); @@ -63,7 +67,7 @@ Matrix hessian = dense_jacobian.transpose() * dense_jacobian; hessian.diagonal() += diagonal.array().square().matrix(); - BlockSparseJacobiPreconditioner pre(*jacobian); + BlockSparseJacobiPreconditioner pre(preconditioner_options, *jacobian); pre.Update(*jacobian, diagonal.data()); // The const_cast is needed to be able to call GetCell. @@ -104,6 +108,10 @@ options.block_density = 0.25; std::mt19937 prng; + Preconditioner::Options preconditioner_options; + ContextImpl context; + preconditioner_options.context = &context; + for (int trial = 0; trial < kNumtrials; ++trial) { auto jacobian = CompressedRowSparseMatrix::CreateRandomMatrix(options, prng); @@ -114,7 +122,7 @@ Matrix hessian = dense_jacobian.transpose() * dense_jacobian; hessian.diagonal() += diagonal.array().square().matrix(); - BlockCRSJacobiPreconditioner pre(*jacobian); + BlockCRSJacobiPreconditioner pre(preconditioner_options, *jacobian); pre.Update(*jacobian, diagonal.data()); auto& m = pre.matrix();
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc index bed4ad0..2215ab9 100644 --- a/internal/ceres/cgnr_solver.cc +++ b/internal/ceres/cgnr_solver.cc
@@ -135,18 +135,20 @@ double* x) { EventLogger event_logger("CgnrSolver::Solve"); if (!preconditioner_) { + Preconditioner::Options preconditioner_options; + preconditioner_options.type = options_.preconditioner_type; + preconditioner_options.subset_preconditioner_start_row_block = + options_.subset_preconditioner_start_row_block; + preconditioner_options.sparse_linear_algebra_library_type = + options_.sparse_linear_algebra_library_type; + preconditioner_options.ordering_type = options_.ordering_type; + preconditioner_options.num_threads = options_.num_threads; + preconditioner_options.context = options_.context; + if (options_.preconditioner_type == JACOBI) { - preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>(*A); + preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>( + preconditioner_options, *A); } else if (options_.preconditioner_type == SUBSET) { - Preconditioner::Options preconditioner_options; - preconditioner_options.type = SUBSET; - preconditioner_options.subset_preconditioner_start_row_block = - options_.subset_preconditioner_start_row_block; - preconditioner_options.sparse_linear_algebra_library_type = - options_.sparse_linear_algebra_library_type; - preconditioner_options.ordering_type = options_.ordering_type; - preconditioner_options.num_threads = options_.num_threads; - preconditioner_options.context = options_.context; preconditioner_ = std::make_unique<SubsetPreconditioner>(preconditioner_options, *A); } else { @@ -238,9 +240,11 @@ class CERES_NO_EXPORT CudaJacobiPreconditioner final : public CudaPreconditioner { public: - explicit CudaJacobiPreconditioner(ContextImpl* context, + explicit CudaJacobiPreconditioner(Preconditioner::Options options, const CompressedRowSparseMatrix& A) - : cpu_preconditioner_(A), m_(context, cpu_preconditioner_.matrix()) {} + : options_(std::move(options)), + cpu_preconditioner_(options_, A), + m_(options_.context, cpu_preconditioner_.matrix()) {} ~CudaJacobiPreconditioner() = 
default; void Update(const CompressedRowSparseMatrix& A, const double* D) final { @@ -253,6 +257,7 @@ } private: + Preconditioner::Options options_; BlockCRSJacobiPreconditioner cpu_preconditioner_; CudaSparseMatrix m_; }; @@ -297,9 +302,20 @@ Atb_ = std::make_unique<CudaVector>(options_.context, A.num_cols()); Ax_ = std::make_unique<CudaVector>(options_.context, A.num_rows()); D_ = std::make_unique<CudaVector>(options_.context, A.num_cols()); + + Preconditioner::Options preconditioner_options; + preconditioner_options.type = options_.preconditioner_type; + preconditioner_options.subset_preconditioner_start_row_block = + options_.subset_preconditioner_start_row_block; + preconditioner_options.sparse_linear_algebra_library_type = + options_.sparse_linear_algebra_library_type; + preconditioner_options.ordering_type = options_.ordering_type; + preconditioner_options.num_threads = options_.num_threads; + preconditioner_options.context = options_.context; + if (options_.preconditioner_type == JACOBI) { preconditioner_ = - std::make_unique<CudaJacobiPreconditioner>(options_.context, A); + std::make_unique<CudaJacobiPreconditioner>(preconditioner_options, A); } else { preconditioner_ = std::make_unique<CudaIdentityPreconditioner>(); }