Parallelize block_jacobi_preconditioner
Use ParallelFor to parallelize both versions of the
block Jacobi preconditioner. Also add benchmarks for
varying number of threads.
Benchmark on M1 Mac Pro
Before:
-----------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditionerBA 44847927 ns 44788313 ns 16
BM_BlockCRSJacobiPreconditionerBA 48772330 ns 48723571 ns 14
BM_BlockSparseJacobiPreconditionerUnstructured 62385231 ns 62306818 ns 11
BM_BlockCRSJacobiPreconditionerUnstructured 60671473 ns 60577727 ns 11
After:
--------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------------------------------
BM_BlockSparseJacobiPreconditionerBA/1 53314862 ns 53302308 ns 13
BM_BlockSparseJacobiPreconditionerBA/2 33601214 ns 33295143 ns 21
BM_BlockSparseJacobiPreconditionerBA/4 28162794 ns 27224167 ns 30
BM_BlockSparseJacobiPreconditionerBA/8 31402448 ns 28038760 ns 25
BM_BlockSparseJacobiPreconditionerBA/16 30820813 ns 22625233 ns 30
BM_BlockCRSJacobiPreconditionerBA/1 60348194 ns 60332167 ns 12
BM_BlockCRSJacobiPreconditionerBA/2 35489954 ns 34782050 ns 20
BM_BlockCRSJacobiPreconditionerBA/4 23636360 ns 22547032 ns 31
BM_BlockCRSJacobiPreconditionerBA/8 31688798 ns 27857800 ns 25
BM_BlockCRSJacobiPreconditionerBA/16 30806695 ns 20562516 ns 31
BM_BlockSparseJacobiPreconditionerUnstructured/1 59793396 ns 59788583 ns 12
BM_BlockSparseJacobiPreconditionerUnstructured/2 35192900 ns 34968900 ns 20
BM_BlockSparseJacobiPreconditionerUnstructured/4 30171145 ns 28924480 ns 25
BM_BlockSparseJacobiPreconditionerUnstructured/8 24982583 ns 23193172 ns 29
BM_BlockSparseJacobiPreconditionerUnstructured/16 23370546 ns 18389694 ns 36
BM_BlockCRSJacobiPreconditionerUnstructured/1 63204538 ns 63204545 ns 11
BM_BlockCRSJacobiPreconditionerUnstructured/2 34466060 ns 34193429 ns 21
BM_BlockCRSJacobiPreconditionerUnstructured/4 22712230 ns 20491147 ns 34
BM_BlockCRSJacobiPreconditionerUnstructured/8 16701833 ns 16190395 ns 43
BM_BlockCRSJacobiPreconditionerUnstructured/16 16762565 ns 12857304 ns 56
Note that single threaded performance gets worse. Performance goes up for 2 and 4 threads
and then essentially stalls.
Change-Id: I96a5d2f719545e14c03d73e71c8c0564e8c1c729
diff --git a/internal/ceres/block_jacobi_preconditioner.cc b/internal/ceres/block_jacobi_preconditioner.cc
index b452e7d..b7ee002 100644
--- a/internal/ceres/block_jacobi_preconditioner.cc
+++ b/internal/ceres/block_jacobi_preconditioner.cc
@@ -39,12 +39,14 @@
#include "ceres/block_structure.h"
#include "ceres/casts.h"
#include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
#include "ceres/small_blas.h"
namespace ceres::internal {
BlockSparseJacobiPreconditioner::BlockSparseJacobiPreconditioner(
- const BlockSparseMatrix& A) {
+ Preconditioner::Options options, const BlockSparseMatrix& A)
+ : options_(std::move(options)) {
m_ = std::make_unique<BlockRandomAccessDiagonalMatrix>(
A.block_structure()->cols);
}
@@ -56,48 +58,63 @@
const CompressedRowBlockStructure* bs = A.block_structure();
const double* values = A.values();
m_->SetZero();
- for (int i = 0; i < bs->rows.size(); ++i) {
- const int row_block_size = bs->rows[i].block.size;
- const std::vector<Cell>& cells = bs->rows[i].cells;
- for (const auto& cell : cells) {
- const int block_id = cell.block_id;
- const int col_block_size = bs->cols[block_id].size;
- int r, c, row_stride, col_stride;
- CellInfo* cell_info =
- m_->GetCell(block_id, block_id, &r, &c, &row_stride, &col_stride);
- MatrixRef m(cell_info->values, row_stride, col_stride);
- ConstMatrixRef b(values + cell.position, row_block_size, col_block_size);
- // clang-format off
- MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic,
- Eigen::Dynamic,Eigen::Dynamic, 1>(
- values + cell.position, row_block_size,col_block_size,
- values + cell.position, row_block_size,col_block_size,
- cell_info->values,r, c,row_stride,col_stride);
- // clang-format on
- }
- }
+ ParallelFor(options_.context,
+ 0,
+ bs->rows.size(),
+ options_.num_threads,
+ [this, bs, values](int i) {
+ const int row_block_size = bs->rows[i].block.size;
+ const std::vector<Cell>& cells = bs->rows[i].cells;
+ for (const auto& cell : cells) {
+ const int block_id = cell.block_id;
+ const int col_block_size = bs->cols[block_id].size;
+ int r, c, row_stride, col_stride;
+ CellInfo* cell_info = m_->GetCell(
+ block_id, block_id, &r, &c, &row_stride, &col_stride);
+ MatrixRef m(cell_info->values, row_stride, col_stride);
+ ConstMatrixRef b(
+ values + cell.position, row_block_size, col_block_size);
+ std::lock_guard<std::mutex> l(cell_info->m);
+ // clang-format off
+ MatrixTransposeMatrixMultiply<Eigen::Dynamic, Eigen::Dynamic,
+ Eigen::Dynamic,Eigen::Dynamic, 1>(
+ values + cell.position, row_block_size,col_block_size,
+ values + cell.position, row_block_size,col_block_size,
+ cell_info->values,r, c,row_stride,col_stride);
+ // clang-format on
+ }
+ });
if (D != nullptr) {
// Add the diagonal.
- int position = 0;
- for (int i = 0; i < bs->cols.size(); ++i) {
- const int block_size = bs->cols[i].size;
- int r, c, row_stride, col_stride;
- CellInfo* cell_info = m_->GetCell(i, i, &r, &c, &row_stride, &col_stride);
- MatrixRef m(cell_info->values, row_stride, col_stride);
- m.block(r, c, block_size, block_size).diagonal() +=
- ConstVectorRef(D + position, block_size).array().square().matrix();
- position += block_size;
- }
+ ParallelFor(options_.context,
+ 0,
+ bs->cols.size(),
+ options_.num_threads,
+ [this, bs, D](int i) {
+ const int block_size = bs->cols[i].size;
+ int r, c, row_stride, col_stride;
+ CellInfo* cell_info =
+ m_->GetCell(i, i, &r, &c, &row_stride, &col_stride);
+ MatrixRef m(cell_info->values, row_stride, col_stride);
+ m.block(r, c, block_size, block_size).diagonal() +=
+ ConstVectorRef(D + bs->cols[i].position, block_size)
+ .array()
+ .square()
+ .matrix();
+ });
}
+ // TODO(sameeragarwal): Once matrices are threaded, this call to invert should
+ // also be parallelized.
m_->Invert();
return true;
}
BlockCRSJacobiPreconditioner::BlockCRSJacobiPreconditioner(
- const CompressedRowSparseMatrix& A) {
+ Preconditioner::Options options, const CompressedRowSparseMatrix& A)
+ : options_(std::move(options)), locks_(A.col_blocks().size()) {
auto& col_blocks = A.col_blocks();
// Compute the number of non-zeros in the preconditioner. This is needed so
@@ -126,6 +143,12 @@
}
}
+ // In reality we only need num_col_blocks locks, however that would require
+ // that in UpdateImpl we are able to look up the column block from the it
+ // first column. To save ourselves this map we will instead spend a few extra
+ // lock objects.
+ std::vector<std::mutex> locks(A.num_cols());
+ locks_.swap(locks);
CHECK_EQ(m_rows[A.num_cols()], m_nnz);
}
@@ -141,49 +164,60 @@
const int* a_rows = A.rows();
const int* a_cols = A.cols();
const double* a_values = A.values();
-
- m_->SetZero();
double* m_values = m_->mutable_values();
const int* m_rows = m_->rows();
- for (int i = 0; i < num_row_blocks; ++i) {
- const int row = row_blocks[i].position;
- const int row_block_size = row_blocks[i].size;
- const int row_nnz = a_rows[row + 1] - a_rows[row];
- ConstMatrixRef row_block(a_values + a_rows[row], row_block_size, row_nnz);
- int c = 0;
- while (c < row_nnz) {
- const int idx = a_rows[row] + c;
- const int col = a_cols[idx];
- const int col_block_size = m_rows[col + 1] - m_rows[col];
+ m_->SetZero();
- // We make use of the fact that the entire diagonal block is stored
- // contiguously in memory as a row-major matrix.
- MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
+ ParallelFor(
+ options_.context,
+ 0,
+ num_row_blocks,
+ options_.num_threads,
+ [this, row_blocks, a_rows, a_cols, a_values, m_values, m_rows](int i) {
+ const int row = row_blocks[i].position;
+ const int row_block_size = row_blocks[i].size;
+ const int row_nnz = a_rows[row + 1] - a_rows[row];
+ ConstMatrixRef row_block(
+ a_values + a_rows[row], row_block_size, row_nnz);
+ int c = 0;
+ while (c < row_nnz) {
+ const int idx = a_rows[row] + c;
+ const int col = a_cols[idx];
+ const int col_block_size = m_rows[col + 1] - m_rows[col];
- // We do not have a row_stride version of MatrixTransposeMatrixMultiply,
- // otherwise we could use it here to further speed up the following
- // expression.
- auto b = row_block.middleCols(c, col_block_size);
- m.noalias() += b.transpose() * b;
- c += col_block_size;
- }
- }
+ // We make use of the fact that the entire diagonal block is
+ // stored contiguously in memory as a row-major matrix.
+ MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
+ // We do not have a row_stride version of
+ // MatrixTransposeMatrixMultiply, otherwise we could use it
+ // here to further speed up the following expression.
+ auto b = row_block.middleCols(c, col_block_size);
+ std::lock_guard<std::mutex> l(locks_[col]);
+ m.noalias() += b.transpose() * b;
+ c += col_block_size;
+ }
+ });
- for (int i = 0; i < num_col_blocks; ++i) {
- const int col = col_blocks[i].position;
- const int col_block_size = col_blocks[i].size;
- MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
+ ParallelFor(
+ options_.context,
+ 0,
+ num_col_blocks,
+ options_.num_threads,
+ [col_blocks, m_rows, m_values, D](int i) {
+ const int col = col_blocks[i].position;
+ const int col_block_size = col_blocks[i].size;
+ MatrixRef m(m_values + m_rows[col], col_block_size, col_block_size);
- if (D != nullptr) {
- m.diagonal() +=
- ConstVectorRef(D + col, col_block_size).array().square().matrix();
- }
+ if (D != nullptr) {
+ m.diagonal() +=
+ ConstVectorRef(D + col, col_block_size).array().square().matrix();
+ }
- // TODO(sameeragarwal): Deal with Cholesky inversion failure here and
- // elsewhere.
- m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size));
- }
+ // TODO(sameeragarwal): Deal with Cholesky inversion failure here and
+ // elsewhere.
+ m = m.llt().solve(Matrix::Identity(col_block_size, col_block_size));
+ });
return true;
}
diff --git a/internal/ceres/block_jacobi_preconditioner.h b/internal/ceres/block_jacobi_preconditioner.h
index 265b7d6..a39ad12 100644
--- a/internal/ceres/block_jacobi_preconditioner.h
+++ b/internal/ceres/block_jacobi_preconditioner.h
@@ -52,7 +52,8 @@
: public BlockSparseMatrixPreconditioner {
public:
// A must remain valid while the BlockJacobiPreconditioner is.
- explicit BlockSparseJacobiPreconditioner(const BlockSparseMatrix& A);
+ explicit BlockSparseJacobiPreconditioner(Preconditioner::Options,
+ const BlockSparseMatrix& A);
~BlockSparseJacobiPreconditioner() override;
void RightMultiplyAndAccumulate(const double* x, double* y) const final {
return m_->RightMultiplyAndAccumulate(x, y);
@@ -64,6 +65,7 @@
private:
bool UpdateImpl(const BlockSparseMatrix& A, const double* D) final;
+ Preconditioner::Options options_;
std::unique_ptr<BlockRandomAccessDiagonalMatrix> m_;
};
@@ -73,7 +75,8 @@
: public CompressedRowSparseMatrixPreconditioner {
public:
// A must remain valid while the BlockJacobiPreconditioner is.
- explicit BlockCRSJacobiPreconditioner(const CompressedRowSparseMatrix& A);
+ explicit BlockCRSJacobiPreconditioner(Preconditioner::Options options,
+ const CompressedRowSparseMatrix& A);
~BlockCRSJacobiPreconditioner() override;
void RightMultiplyAndAccumulate(const double* x, double* y) const final {
m_->RightMultiplyAndAccumulate(x, y);
@@ -85,6 +88,8 @@
private:
bool UpdateImpl(const CompressedRowSparseMatrix& A, const double* D) final;
+ Preconditioner::Options options_;
+ std::vector<std::mutex> locks_;
std::unique_ptr<CompressedRowSparseMatrix> m_;
};
diff --git a/internal/ceres/block_jacobi_preconditioner_benchmark.cc b/internal/ceres/block_jacobi_preconditioner_benchmark.cc
index 1d8ad10..d2508c4 100644
--- a/internal/ceres/block_jacobi_preconditioner_benchmark.cc
+++ b/internal/ceres/block_jacobi_preconditioner_benchmark.cc
@@ -59,14 +59,26 @@
std::mt19937 prng;
auto jacobian = CreateFakeBundleAdjustmentJacobian(
kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
- BlockSparseJacobiPreconditioner p(*jacobian);
+
+ Preconditioner::Options preconditioner_options;
+ ContextImpl context;
+ preconditioner_options.context = &context;
+ preconditioner_options.num_threads = state.range(0);
+ context.EnsureMinimumThreads(preconditioner_options.num_threads);
+ BlockSparseJacobiPreconditioner p(preconditioner_options, *jacobian);
+
Vector d = Vector::Ones(jacobian->num_cols());
for (auto _ : state) {
p.Update(*jacobian, d.data());
}
}
-BENCHMARK(BM_BlockSparseJacobiPreconditionerBA);
+BENCHMARK(BM_BlockSparseJacobiPreconditionerBA)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
static void BM_BlockCRSJacobiPreconditionerBA(benchmark::State& state) {
std::mt19937 prng;
@@ -76,14 +88,25 @@
CompressedRowSparseMatrix jacobian_crs(
jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros());
jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
- BlockCRSJacobiPreconditioner p(jacobian_crs);
+ Preconditioner::Options preconditioner_options;
+ ContextImpl context;
+ preconditioner_options.context = &context;
+ preconditioner_options.num_threads = state.range(0);
+ context.EnsureMinimumThreads(preconditioner_options.num_threads);
+ BlockCRSJacobiPreconditioner p(preconditioner_options, jacobian_crs);
+
Vector d = Vector::Ones(jacobian_crs.num_cols());
for (auto _ : state) {
p.Update(jacobian_crs, d.data());
}
}
-BENCHMARK(BM_BlockCRSJacobiPreconditionerBA);
+BENCHMARK(BM_BlockCRSJacobiPreconditionerBA)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
static void BM_BlockSparseJacobiPreconditionerUnstructured(
benchmark::State& state) {
@@ -98,14 +121,25 @@
std::mt19937 prng;
auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
- BlockSparseJacobiPreconditioner p(*jacobian);
+ Preconditioner::Options preconditioner_options;
+ ContextImpl context;
+ preconditioner_options.context = &context;
+ preconditioner_options.num_threads = state.range(0);
+ context.EnsureMinimumThreads(preconditioner_options.num_threads);
+ BlockSparseJacobiPreconditioner p(preconditioner_options, *jacobian);
+
Vector d = Vector::Ones(jacobian->num_cols());
for (auto _ : state) {
p.Update(*jacobian, d.data());
}
}
-BENCHMARK(BM_BlockSparseJacobiPreconditionerUnstructured);
+BENCHMARK(BM_BlockSparseJacobiPreconditionerUnstructured)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
static void BM_BlockCRSJacobiPreconditionerUnstructured(
benchmark::State& state) {
@@ -123,13 +157,24 @@
CompressedRowSparseMatrix jacobian_crs(
jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros());
jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
- BlockCRSJacobiPreconditioner p(jacobian_crs);
+ Preconditioner::Options preconditioner_options;
+ ContextImpl context;
+ preconditioner_options.context = &context;
+ preconditioner_options.num_threads = state.range(0);
+ context.EnsureMinimumThreads(preconditioner_options.num_threads);
+ BlockCRSJacobiPreconditioner p(preconditioner_options, jacobian_crs);
+
Vector d = Vector::Ones(jacobian_crs.num_cols());
for (auto _ : state) {
p.Update(jacobian_crs, d.data());
}
}
-BENCHMARK(BM_BlockCRSJacobiPreconditionerUnstructured);
+BENCHMARK(BM_BlockCRSJacobiPreconditionerUnstructured)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
} // namespace ceres::internal
diff --git a/internal/ceres/block_jacobi_preconditioner_test.cc b/internal/ceres/block_jacobi_preconditioner_test.cc
index fa051e4..eb3c9d0 100644
--- a/internal/ceres/block_jacobi_preconditioner_test.cc
+++ b/internal/ceres/block_jacobi_preconditioner_test.cc
@@ -55,6 +55,10 @@
options.block_density = 0.25;
std::mt19937 prng;
+ Preconditioner::Options preconditioner_options;
+ ContextImpl context;
+ preconditioner_options.context = &context;
+
for (int trial = 0; trial < kNumtrials; ++trial) {
auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
Vector diagonal = Vector::Ones(jacobian->num_cols());
@@ -63,7 +67,7 @@
Matrix hessian = dense_jacobian.transpose() * dense_jacobian;
hessian.diagonal() += diagonal.array().square().matrix();
- BlockSparseJacobiPreconditioner pre(*jacobian);
+ BlockSparseJacobiPreconditioner pre(preconditioner_options, *jacobian);
pre.Update(*jacobian, diagonal.data());
// The const_cast is needed to be able to call GetCell.
@@ -104,6 +108,10 @@
options.block_density = 0.25;
std::mt19937 prng;
+ Preconditioner::Options preconditioner_options;
+ ContextImpl context;
+ preconditioner_options.context = &context;
+
for (int trial = 0; trial < kNumtrials; ++trial) {
auto jacobian =
CompressedRowSparseMatrix::CreateRandomMatrix(options, prng);
@@ -114,7 +122,7 @@
Matrix hessian = dense_jacobian.transpose() * dense_jacobian;
hessian.diagonal() += diagonal.array().square().matrix();
- BlockCRSJacobiPreconditioner pre(*jacobian);
+ BlockCRSJacobiPreconditioner pre(preconditioner_options, *jacobian);
pre.Update(*jacobian, diagonal.data());
auto& m = pre.matrix();
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index bed4ad0..2215ab9 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -135,18 +135,20 @@
double* x) {
EventLogger event_logger("CgnrSolver::Solve");
if (!preconditioner_) {
+ Preconditioner::Options preconditioner_options;
+ preconditioner_options.type = options_.preconditioner_type;
+ preconditioner_options.subset_preconditioner_start_row_block =
+ options_.subset_preconditioner_start_row_block;
+ preconditioner_options.sparse_linear_algebra_library_type =
+ options_.sparse_linear_algebra_library_type;
+ preconditioner_options.ordering_type = options_.ordering_type;
+ preconditioner_options.num_threads = options_.num_threads;
+ preconditioner_options.context = options_.context;
+
if (options_.preconditioner_type == JACOBI) {
- preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>(*A);
+ preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>(
+ preconditioner_options, *A);
} else if (options_.preconditioner_type == SUBSET) {
- Preconditioner::Options preconditioner_options;
- preconditioner_options.type = SUBSET;
- preconditioner_options.subset_preconditioner_start_row_block =
- options_.subset_preconditioner_start_row_block;
- preconditioner_options.sparse_linear_algebra_library_type =
- options_.sparse_linear_algebra_library_type;
- preconditioner_options.ordering_type = options_.ordering_type;
- preconditioner_options.num_threads = options_.num_threads;
- preconditioner_options.context = options_.context;
preconditioner_ =
std::make_unique<SubsetPreconditioner>(preconditioner_options, *A);
} else {
@@ -238,9 +240,11 @@
class CERES_NO_EXPORT CudaJacobiPreconditioner final
: public CudaPreconditioner {
public:
- explicit CudaJacobiPreconditioner(ContextImpl* context,
+ explicit CudaJacobiPreconditioner(Preconditioner::Options options,
const CompressedRowSparseMatrix& A)
- : cpu_preconditioner_(A), m_(context, cpu_preconditioner_.matrix()) {}
+ : options_(std::move(options)),
+ cpu_preconditioner_(options_, A),
+ m_(options_->context, cpu_preconditioner_.matrix()) {}
~CudaJacobiPreconditioner() = default;
void Update(const CompressedRowSparseMatrix& A, const double* D) final {
@@ -253,6 +257,7 @@
}
private:
+ Preconditioner::Options options_;
BlockCRSJacobiPreconditioner cpu_preconditioner_;
CudaSparseMatrix m_;
};
@@ -297,9 +302,20 @@
Atb_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
Ax_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
D_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
+
+ Preconditioner::Options preconditioner_options;
+ preconditioner_options.type = options_.preconditioner_type;
+ preconditioner_options.subset_preconditioner_start_row_block =
+ options_.subset_preconditioner_start_row_block;
+ preconditioner_options.sparse_linear_algebra_library_type =
+ options_.sparse_linear_algebra_library_type;
+ preconditioner_options.ordering_type = options_.ordering_type;
+ preconditioner_options.num_threads = options_.num_threads;
+ preconditioner_options.context = options_.context;
+
if (options_.preconditioner_type == JACOBI) {
preconditioner_ =
- std::make_unique<CudaJacobiPreconditioner>(options_.context, A);
+ std::make_unique<CudaJacobiPreconditioner>(preconditioner_options, A);
} else {
preconditioner_ = std::make_unique<CudaIdentityPreconditioner>();
}