Parallel right products for partitioned view
Parallel implementations for right-multiply by dense vector for:
- Partitioned matrix view
- Block-sparse matrix
- CRS matrix (non-symmetric only)
When coupled with non-interleaving indexes in parallel for, this
simple aproach provides a reasonable speedup.
For example, in CRS case difference with GPGPU approach reduces
closer to memory throughput ratio for high enough core count.
./bin/spmv_benchmark
-------------------------------------------------------------------
Benchmark Time
-------------------------------------------------------------------
BM_BlockSparseRightMultiplyAndAccumulateBA/1 28.5 ms
BM_BlockSparseRightMultiplyAndAccumulateBA/2 15.7 ms
BM_BlockSparseRightMultiplyAndAccumulateBA/4 9.01 ms
BM_BlockSparseRightMultiplyAndAccumulateBA/8 5.60 ms
BM_BlockSparseRightMultiplyAndAccumulateBA/16 3.86 ms
BM_BlockSparseRightMultiplyAndAccumulateBA/28 3.84 ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/1 23.8 ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/2 15.0 ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/4 8.01 ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/8 4.02 ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/16 2.39 ms
BM_BlockSparseRightMultiplyAndAccumulateUnstructured/28 1.68 ms
BM_BlockSparseLeftMultiplyAndAccumulateBA 30.7 ms
BM_BlockSparseLeftMultiplyAndAccumulateUnstructured 41.5 ms
BM_CRSRightMultiplyAndAccumulateBA/1 24.1 ms
BM_CRSRightMultiplyAndAccumulateBA/2 13.6 ms
BM_CRSRightMultiplyAndAccumulateBA/4 8.70 ms
BM_CRSRightMultiplyAndAccumulateBA/8 5.34 ms
BM_CRSRightMultiplyAndAccumulateBA/16 3.99 ms
BM_CRSRightMultiplyAndAccumulateBA/28 4.00 ms
BM_CRSRightMultiplyAndAccumulateUnstructured/1 21.1 ms
BM_CRSRightMultiplyAndAccumulateUnstructured/2 10.83 ms
BM_CRSRightMultiplyAndAccumulateUnstructured/4 5.88 ms
BM_CRSRightMultiplyAndAccumulateUnstructured/8 3.68 ms
BM_CRSRightMultiplyAndAccumulateUnstructured/16 2.21 ms
BM_CRSRightMultiplyAndAccumulateUnstructured/28 1.71 ms
BM_CRSLeftMultiplyAndAccumulateBA 23.6 ms
BM_CRSLeftMultiplyAndAccumulateUnstructured 22.5 ms
BM_CudaRightMultiplyAndAccumulateBA 0.679 ms
BM_CudaRightMultiplyAndAccumulateUnstructured 0.480 ms
BM_CudaLeftMultiplyAndAccumulateBA 0.774 ms
BM_CudaLeftMultiplyAndAccumulateUnstructured 0.361 ms
./bin/partitioned_matrix_view_benchmark
-----------------------------------------------------------------
Benchmark Time
-----------------------------------------------------------------
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/1 18.5 ms
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/2 10.7 ms
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/4 6.34 ms
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/8 4.26 ms
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/16 3.86 ms
BM_PatitionedViewRightMultiplyAndAccumulateE_Static/28 3.75 ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/1 18.8 ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/2 11.9 ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/4 6.94 ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/8 4.41 ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/16 3.63 ms
BM_PatitionedViewRightMultiplyAndAccumulateF_Static/28 3.86 ms
Timings correspond to intel 8176 cpu and 2080ti nvidia gpu,
with OpenMP threading backend.
Change-Id: Idc07d0563103d057ca3c8412de81a7823fe232af
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index fef1c2d..350b3b5 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -598,6 +598,9 @@
add_executable(spmv_benchmark spmv_benchmark.cc)
add_dependencies_to_benchmark(spmv_benchmark)
+ add_executable(partitioned_matrix_view_benchmark partitioned_matrix_view_benchmark.cc)
+ add_dependencies_to_benchmark(partitioned_matrix_view_benchmark)
+
add_executable(block_jacobi_preconditioner_benchmark
block_jacobi_preconditioner_benchmark.cc)
add_dependencies_to_benchmark(block_jacobi_preconditioner_benchmark)
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index 0401696..0dd3b20 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -39,6 +39,7 @@
#include "ceres/block_structure.h"
#include "ceres/crs_matrix.h"
#include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
#include "ceres/small_blas.h"
#include "ceres/triplet_sparse_matrix.h"
#include "glog/logging.h"
@@ -88,25 +89,44 @@
void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
double* y) const {
+ RightMultiplyAndAccumulate(x, y, nullptr, 1);
+}
+
+void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
- for (int i = 0; i < block_structure_->rows.size(); ++i) {
- int row_block_pos = block_structure_->rows[i].block.position;
- int row_block_size = block_structure_->rows[i].block.size;
- const std::vector<Cell>& cells = block_structure_->rows[i].cells;
- for (const auto& cell : cells) {
- int col_block_id = cell.block_id;
- int col_block_size = block_structure_->cols[col_block_id].size;
- int col_block_pos = block_structure_->cols[col_block_id].position;
- MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
- values_.get() + cell.position,
- row_block_size,
- col_block_size,
- x + col_block_pos,
- y + row_block_pos);
- }
- }
+ const auto values = values_.get();
+ const auto block_structure = block_structure_.get();
+ const auto num_row_blocks = block_structure->rows.size();
+
+ ParallelFor(context,
+ 0,
+ num_row_blocks,
+ num_threads,
+ [values, block_structure, x, y](int row_block_id) {
+ const int row_block_pos =
+ block_structure->rows[row_block_id].block.position;
+ const int row_block_size =
+ block_structure->rows[row_block_id].block.size;
+ const auto& cells = block_structure->rows[row_block_id].cells;
+ for (const auto& cell : cells) {
+ const int col_block_id = cell.block_id;
+ const int col_block_size =
+ block_structure->cols[col_block_id].size;
+ const int col_block_pos =
+ block_structure->cols[col_block_id].position;
+ MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+ values + cell.position,
+ row_block_size,
+ col_block_size,
+ x + col_block_pos,
+ y + row_block_pos);
+ }
+ });
}
void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h
index b7ea1df..3a8dac9 100644
--- a/internal/ceres/block_sparse_matrix.h
+++ b/internal/ceres/block_sparse_matrix.h
@@ -39,6 +39,7 @@
#include "ceres/block_structure.h"
#include "ceres/compressed_row_sparse_matrix.h"
+#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
@@ -66,13 +67,16 @@
// CompressedRowBlockStructure objects.
explicit BlockSparseMatrix(CompressedRowBlockStructure* block_structure);
- BlockSparseMatrix();
BlockSparseMatrix(const BlockSparseMatrix&) = delete;
void operator=(const BlockSparseMatrix&) = delete;
// Implementation of SparseMatrix interface.
void SetZero() final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
+ void RightMultiplyAndAccumulate(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const final;
void LeftMultiplyAndAccumulate(const double* x, double* y) const final;
void SquaredColumnNorm(double* x) const final;
void ScaleColumns(const double* scale) final;
diff --git a/internal/ceres/block_sparse_matrix_test.cc b/internal/ceres/block_sparse_matrix_test.cc
index 7d7d4c9..3f5c78a 100644
--- a/internal/ceres/block_sparse_matrix_test.cc
+++ b/internal/ceres/block_sparse_matrix_test.cc
@@ -115,6 +115,8 @@
}
} // namespace
+const int kNumThreads = 4;
+
class BlockSparseMatrixTest : public ::testing::Test {
protected:
void SetUp() final {
@@ -130,10 +132,12 @@
CHECK_EQ(A_->num_rows(), B_->num_rows());
CHECK_EQ(A_->num_cols(), B_->num_cols());
CHECK_EQ(A_->num_nonzeros(), B_->num_nonzeros());
+ context_.EnsureMinimumThreads(kNumThreads);
}
std::unique_ptr<BlockSparseMatrix> A_;
std::unique_ptr<TripletSparseMatrix> B_;
+ ContextImpl context_;
};
TEST_F(BlockSparseMatrixTest, SetZeroTest) {
@@ -153,6 +157,20 @@
}
}
+TEST_F(BlockSparseMatrixTest, RightMultiplyAndAccumulateParallelTest) {
+ Vector y_0 = Vector::Random(A_->num_rows());
+ Vector y_s = y_0;
+ Vector y_p = y_0;
+
+ Vector x = Vector::Random(A_->num_cols());
+ A_->RightMultiplyAndAccumulate(x.data(), y_s.data());
+
+ A_->RightMultiplyAndAccumulate(x.data(), y_p.data(), &context_, kNumThreads);
+
+ // Current parallel implementation is expected to be bit-exact
+ EXPECT_EQ((y_s - y_p).norm(), 0.);
+}
+
TEST_F(BlockSparseMatrixTest, LeftMultiplyAndAccumulateTest) {
Vector y_a = Vector::Zero(A_->num_cols());
Vector y_b = Vector::Zero(A_->num_cols());
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index 17a5f62..e1c571d 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -84,14 +84,21 @@
class CERES_NO_EXPORT CgnrLinearOperator final
: public ConjugateGradientsLinearOperator<Vector> {
public:
- CgnrLinearOperator(const LinearOperator& A, const double* D)
- : A_(A), D_(D), z_(Vector::Zero(A.num_rows())) {}
+ CgnrLinearOperator(const LinearOperator& A,
+ const double* D,
+ ContextImpl* context,
+ int num_threads)
+ : A_(A),
+ D_(D),
+ z_(Vector::Zero(A.num_rows())),
+ context_(context),
+ num_threads_(num_threads) {}
void RightMultiplyAndAccumulate(const Vector& x, Vector& y) final {
// z = Ax
// y = y + Atz
z_.setZero();
- A_.RightMultiplyAndAccumulate(x, z_);
+ A_.RightMultiplyAndAccumulate(x, z_, context_, num_threads_);
A_.LeftMultiplyAndAccumulate(z_, y);
// y = y + DtDx
@@ -105,6 +112,9 @@
const LinearOperator& A_;
const double* D_;
Vector z_;
+
+ ContextImpl* context_;
+ int num_threads_;
};
CgnrSolver::CgnrSolver(LinearSolver::Options options)
@@ -166,7 +176,8 @@
cg_options.r_tolerance = per_solve_options.r_tolerance;
// lhs = AtA + DtD
- CgnrLinearOperator lhs(*A, per_solve_options.D);
+ CgnrLinearOperator lhs(
+ *A, per_solve_options.D, options_.context, options_.num_threads);
// rhs = Atb.
Vector rhs(A->num_cols());
rhs.setZero();
diff --git a/internal/ceres/compressed_row_sparse_matrix.cc b/internal/ceres/compressed_row_sparse_matrix.cc
index 057adfd..553dc21 100644
--- a/internal/ceres/compressed_row_sparse_matrix.cc
+++ b/internal/ceres/compressed_row_sparse_matrix.cc
@@ -37,8 +37,10 @@
#include <random>
#include <vector>
+#include "ceres/context_impl.h"
#include "ceres/crs_matrix.h"
#include "ceres/internal/export.h"
+#include "ceres/parallel_for.h"
#include "ceres/triplet_sparse_matrix.h"
#include "glog/logging.h"
@@ -278,19 +280,34 @@
// TODO(sameeragarwal): Make RightMultiplyAndAccumulate and
// LeftMultiplyAndAccumulate block-aware for higher performance.
+void CompressedRowSparseMatrix::RightMultiplyAndAccumulate(
+ const double* x, double* y, ContextImpl* context, int num_threads) const {
+ if (storage_type_ != StorageType::UNSYMMETRIC) {
+ RightMultiplyAndAccumulate(x, y);
+ return;
+ }
+
+ auto values = values_.data();
+ auto rows = rows_.data();
+ auto cols = cols_.data();
+
+ ParallelFor(
+ context, 0, num_rows_, num_threads, [values, rows, cols, x, y](int row) {
+ for (int idx = rows[row]; idx < rows[row + 1]; ++idx) {
+ const int c = cols[idx];
+ const double v = values[idx];
+ y[row] += v * x[c];
+ }
+ });
+}
+
void CompressedRowSparseMatrix::RightMultiplyAndAccumulate(const double* x,
double* y) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
if (storage_type_ == StorageType::UNSYMMETRIC) {
- for (int r = 0; r < num_rows_; ++r) {
- for (int idx = rows_[r]; idx < rows_[r + 1]; ++idx) {
- const int c = cols_[idx];
- const double v = values_[idx];
- y[r] += v * x[c];
- }
- }
+ RightMultiplyAndAccumulate(x, y, nullptr, 1);
} else if (storage_type_ == StorageType::UPPER_TRIANGULAR) {
// Because of their block structure, we will have entries that lie
// above (below) the diagonal for lower (upper) triangular matrices,
diff --git a/internal/ceres/compressed_row_sparse_matrix.h b/internal/ceres/compressed_row_sparse_matrix.h
index 8668d06..6fe730e 100644
--- a/internal/ceres/compressed_row_sparse_matrix.h
+++ b/internal/ceres/compressed_row_sparse_matrix.h
@@ -48,6 +48,7 @@
namespace internal {
+class ContextImpl;
class TripletSparseMatrix;
class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix {
@@ -103,6 +104,10 @@
~CompressedRowSparseMatrix() override;
void SetZero() final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
+ void RightMultiplyAndAccumulate(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const final;
void LeftMultiplyAndAccumulate(const double* x, double* y) const final;
void SquaredColumnNorm(double* x) const final;
void ScaleColumns(const double* scale) final;
diff --git a/internal/ceres/compressed_row_sparse_matrix_test.cc b/internal/ceres/compressed_row_sparse_matrix_test.cc
index 82c982b..46136db 100644
--- a/internal/ceres/compressed_row_sparse_matrix_test.cc
+++ b/internal/ceres/compressed_row_sparse_matrix_test.cc
@@ -39,6 +39,7 @@
#include "Eigen/SparseCore"
#include "ceres/casts.h"
+#include "ceres/context_impl.h"
#include "ceres/crs_matrix.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_least_squares_problems.h"
@@ -599,6 +600,73 @@
CompressedRowSparseMatrix::StorageType::UNSYMMETRIC),
ParamInfoToString);
+const int kMaxNumThreads = 8;
+class CompressedRowSparseMatrixParallelTest
+ : public ::testing::TestWithParam<int> {
+ void SetUp() final { context_.EnsureMinimumThreads(kMaxNumThreads); }
+
+ protected:
+ ContextImpl context_;
+};
+
+TEST_P(CompressedRowSparseMatrixParallelTest,
+ RightMultiplyAndAccumulateUnsymmetric) {
+ const int kMinNumBlocks = 1;
+ const int kMaxNumBlocks = 10;
+ const int kMinBlockSize = 1;
+ const int kMaxBlockSize = 5;
+ const int kNumTrials = 10;
+ const int kNumThreads = GetParam();
+ std::mt19937 prng;
+ std::uniform_real_distribution<double> uniform(0.5, 1.0);
+ for (int num_blocks = kMinNumBlocks; num_blocks < kMaxNumBlocks;
+ ++num_blocks) {
+ for (int trial = 0; trial < kNumTrials; ++trial) {
+ CompressedRowSparseMatrix::RandomMatrixOptions options;
+ options.num_col_blocks = num_blocks;
+ options.min_col_block_size = kMinBlockSize;
+ options.max_col_block_size = kMaxBlockSize;
+ options.num_row_blocks = 2 * num_blocks;
+ options.min_row_block_size = kMinBlockSize;
+ options.max_row_block_size = kMaxBlockSize;
+ options.block_density = uniform(prng);
+ options.storage_type =
+ CompressedRowSparseMatrix::StorageType::UNSYMMETRIC;
+ auto matrix =
+ CompressedRowSparseMatrix::CreateRandomMatrix(options, prng);
+ const int num_rows = matrix->num_rows();
+ const int num_cols = matrix->num_cols();
+
+ Vector x(num_cols);
+ x.setRandom();
+
+ Vector actual_y(num_rows);
+ actual_y.setZero();
+ matrix->RightMultiplyAndAccumulate(
+ x.data(), actual_y.data(), &context_, kNumThreads);
+
+ Matrix dense;
+ matrix->ToDenseMatrix(&dense);
+ Vector expected_y = dense * x;
+
+ ASSERT_NEAR((expected_y - actual_y).norm() / actual_y.norm(),
+ 0.0,
+ std::numeric_limits<double>::epsilon() * 10)
+ << "\n"
+ << dense << "x:\n"
+ << x.transpose() << "\n"
+ << "expected: \n"
+ << expected_y.transpose() << "\n"
+ << "actual: \n"
+ << actual_y.transpose();
+ }
+ }
+}
+INSTANTIATE_TEST_SUITE_P(ParallelProducts,
+ CompressedRowSparseMatrixParallelTest,
+ ::testing::Values(1, 2, 4, 8),
+ ::testing::PrintToStringParamName());
+
// TODO(sameeragarwal) Add tests for the random matrix creation methods.
} // namespace ceres::internal
diff --git a/internal/ceres/fake_bundle_adjustment_jacobian.cc b/internal/ceres/fake_bundle_adjustment_jacobian.cc
index 9e7d82b..dd2aee8 100644
--- a/internal/ceres/fake_bundle_adjustment_jacobian.cc
+++ b/internal/ceres/fake_bundle_adjustment_jacobian.cc
@@ -96,4 +96,23 @@
return jacobian;
}
+std::pair<
+ std::unique_ptr<PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>>,
+ std::unique_ptr<BlockSparseMatrix>>
+CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras,
+ int num_points,
+ int camera_size,
+ int landmark_size,
+ double visibility,
+ std::mt19937& rng) {
+ using PartitionedView =
+ PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>;
+ auto block_sparse_matrix = CreateFakeBundleAdjustmentJacobian(
+ num_cameras, num_points, camera_size, landmark_size, visibility, rng);
+ auto partitioned_view =
+ std::make_unique<PartitionedView>(*block_sparse_matrix, num_points);
+ return std::make_pair(std::move(partitioned_view),
+ std::move(block_sparse_matrix));
+}
+
} // namespace ceres::internal
diff --git a/internal/ceres/fake_bundle_adjustment_jacobian.h b/internal/ceres/fake_bundle_adjustment_jacobian.h
index 9492b52..41b55c8 100644
--- a/internal/ceres/fake_bundle_adjustment_jacobian.h
+++ b/internal/ceres/fake_bundle_adjustment_jacobian.h
@@ -36,6 +36,7 @@
#include <random>
#include "ceres/block_sparse_matrix.h"
+#include "ceres/partitioned_matrix_view.h"
namespace ceres::internal {
std::unique_ptr<BlockSparseMatrix> CreateFakeBundleAdjustmentJacobian(
@@ -46,6 +47,32 @@
double visibility,
std::mt19937& prng);
+template <int kEBlockSize = 3, int kFBlockSize = 6>
+std::pair<std::unique_ptr<PartitionedMatrixView<2, kEBlockSize, kFBlockSize>>,
+ std::unique_ptr<BlockSparseMatrix>>
+CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras,
+ int num_points,
+ double visibility,
+ std::mt19937& rng) {
+ using PartitionedView = PartitionedMatrixView<2, kEBlockSize, kFBlockSize>;
+ auto block_sparse_matrix = CreateFakeBundleAdjustmentJacobian(
+ num_cameras, num_points, kFBlockSize, kEBlockSize, visibility, rng);
+ auto partitioned_view =
+ std::make_unique<PartitionedView>(*block_sparse_matrix, num_points);
+ return std::make_pair(std::move(partitioned_view),
+ std::move(block_sparse_matrix));
+}
+
+std::pair<
+ std::unique_ptr<PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>>,
+ std::unique_ptr<BlockSparseMatrix>>
+CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras,
+ int num_points,
+ int camera_size,
+ int landmark_size,
+ double visibility,
+ std::mt19937& rng);
+
} // namespace ceres::internal
#endif // CERES_INTERNAL_FAKE_BUNDLE_ADJUSTMENT_JACOBIAN
diff --git a/internal/ceres/implicit_schur_complement.cc b/internal/ceres/implicit_schur_complement.cc
index 9b7ee47..bbd5e4b 100644
--- a/internal/ceres/implicit_schur_complement.cc
+++ b/internal/ceres/implicit_schur_complement.cc
@@ -105,7 +105,8 @@
double* y) const {
// y1 = F x
tmp_rows_.setZero();
- A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
+ A_->RightMultiplyAndAccumulateF(
+ x, tmp_rows_.data(), options_.context, options_.num_threads);
// y2 = E' y1
tmp_e_cols_.setZero();
@@ -114,11 +115,16 @@
// y3 = -(E'E)^-1 y2
tmp_e_cols_2_.setZero();
block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
- tmp_e_cols_2_.data());
+ tmp_e_cols_2_.data(),
+ options_.context,
+ options_.num_threads);
tmp_e_cols_2_ *= -1.0;
// y1 = y1 + E y3
- A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
+ A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(),
+ tmp_rows_.data(),
+ options_.context,
+ options_.num_threads);
// y5 = D * x
if (D_ != nullptr) {
@@ -139,7 +145,8 @@
CHECK(compute_ftf_inverse_);
// y1 = F x
tmp_rows_.setZero();
- A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
+ A_->RightMultiplyAndAccumulateF(
+ x, tmp_rows_.data(), options_.context, options_.num_threads);
// y2 = E' y1
tmp_e_cols_.setZero();
@@ -148,18 +155,23 @@
// y3 = (E'E)^-1 y2
tmp_e_cols_2_.setZero();
block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
- tmp_e_cols_2_.data());
+ tmp_e_cols_2_.data(),
+ options_.context,
+ options_.num_threads);
// y1 = E y3
tmp_rows_.setZero();
- A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
+ A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(),
+ tmp_rows_.data(),
+ options_.context,
+ options_.num_threads);
// y4 = F' y1
tmp_f_cols_.setZero();
A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), tmp_f_cols_.data());
// y += (F'F)^-1 y4
- block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate(tmp_f_cols_.data(),
- y);
+ block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate(
+ tmp_f_cols_.data(), y, options_.context, options_.num_threads);
}
// Given a block diagonal matrix and an optional array of diagonal
@@ -197,7 +209,8 @@
// y1 = F x
tmp_rows_.setZero();
- A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
+ A_->RightMultiplyAndAccumulateF(
+ x, tmp_rows_.data(), options_.context, options_.num_threads);
// y2 = b - y1
tmp_rows_ = ConstVectorRef(b_, num_rows) - tmp_rows_;
@@ -208,8 +221,8 @@
// y = (E'E)^-1 y3
VectorRef(y, num_cols).setZero();
- block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
- y);
+ block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(
+ tmp_e_cols_.data(), y, options_.context, options_.num_threads);
// The full solution vector y has two blocks. The first block of
// variables corresponds to the eliminated variables, which we just
@@ -232,12 +245,13 @@
// y2 = (E'E)^-1 y1
Vector y2 = Vector::Zero(A_->num_cols_e());
- block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
- y2.data());
+ block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(
+ tmp_e_cols_.data(), y2.data(), options_.context, options_.num_threads);
// y3 = E y2
tmp_rows_.setZero();
- A_->RightMultiplyAndAccumulateE(y2.data(), tmp_rows_.data());
+ A_->RightMultiplyAndAccumulateE(
+ y2.data(), tmp_rows_.data(), options_.context, options_.num_threads);
// y3 = b - y3
tmp_rows_ = ConstVectorRef(b_, A_->num_rows()) - tmp_rows_;
diff --git a/internal/ceres/linear_operator.cc b/internal/ceres/linear_operator.cc
index 597a37e..1b518a6 100644
--- a/internal/ceres/linear_operator.cc
+++ b/internal/ceres/linear_operator.cc
@@ -30,8 +30,34 @@
#include "ceres/linear_operator.h"
+#include <glog/logging.h>
+
namespace ceres::internal {
+void LinearOperator::RightMultiplyAndAccumulate(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const {
+ (void)context;
+ if (num_threads != 1) {
+ VLOG(3) << "Parallel right product is not supported by linear operator "
+ "implementation";
+ }
+ RightMultiplyAndAccumulate(x, y);
+}
+
+void LinearOperator::LeftMultiplyAndAccumulate(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const {
+ (void)context;
+ if (num_threads != 1) {
+ VLOG(3) << "Parallel left product is not supported by linear operator "
+ "implementation";
+ }
+ LeftMultiplyAndAccumulate(x, y);
+}
+
LinearOperator::~LinearOperator() = default;
} // namespace ceres::internal
diff --git a/internal/ceres/linear_operator.h b/internal/ceres/linear_operator.h
index 8a1e902..76beb40 100644
--- a/internal/ceres/linear_operator.h
+++ b/internal/ceres/linear_operator.h
@@ -39,6 +39,8 @@
namespace ceres::internal {
+class ContextImpl;
+
// This is an abstract base class for linear operators. It supports
// access to size information and left and right multiply operators.
class CERES_NO_EXPORT LinearOperator {
@@ -47,8 +49,16 @@
// y = y + Ax;
virtual void RightMultiplyAndAccumulate(const double* x, double* y) const = 0;
+ virtual void RightMultiplyAndAccumulate(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const;
// y = y + A'x;
virtual void LeftMultiplyAndAccumulate(const double* x, double* y) const = 0;
+ virtual void LeftMultiplyAndAccumulate(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const;
virtual void RightMultiplyAndAccumulate(const Vector& x, Vector& y) const {
RightMultiplyAndAccumulate(x.data(), y.data());
@@ -58,6 +68,20 @@
LeftMultiplyAndAccumulate(x.data(), y.data());
}
+ virtual void RightMultiplyAndAccumulate(const Vector& x,
+ Vector& y,
+ ContextImpl* context,
+ int num_threads) const {
+ RightMultiplyAndAccumulate(x.data(), y.data(), context, num_threads);
+ }
+
+ virtual void LeftMultiplyAndAccumulate(const Vector& x,
+ Vector& y,
+ ContextImpl* context,
+ int num_threads) const {
+ LeftMultiplyAndAccumulate(x.data(), y.data(), context, num_threads);
+ }
+
virtual int num_rows() const = 0;
virtual int num_cols() const = 0;
};
diff --git a/internal/ceres/partitioned_matrix_view.h b/internal/ceres/partitioned_matrix_view.h
index 7fb1a09..86106da 100644
--- a/internal/ceres/partitioned_matrix_view.h
+++ b/internal/ceres/partitioned_matrix_view.h
@@ -52,6 +52,8 @@
namespace ceres::internal {
+class ContextImpl;
+
// Given generalized bi-partite matrix A = [E F], with the same block
// structure as required by the Schur complement based solver, found
// in schur_complement_solver.h, provide access to the
@@ -75,10 +77,18 @@
// y += Ex
virtual void RightMultiplyAndAccumulateE(const double* x,
double* y) const = 0;
+ virtual void RightMultiplyAndAccumulateE(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const = 0;
// y += Fx
virtual void RightMultiplyAndAccumulateF(const double* x,
double* y) const = 0;
+ virtual void RightMultiplyAndAccumulateF(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const = 0;
// Create and return the block diagonal of the matrix E'E.
virtual std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const = 0;
@@ -130,6 +140,14 @@
void LeftMultiplyAndAccumulateF(const double* x, double* y) const final;
void RightMultiplyAndAccumulateE(const double* x, double* y) const final;
void RightMultiplyAndAccumulateF(const double* x, double* y) const final;
+ void RightMultiplyAndAccumulateE(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const final;
+ void RightMultiplyAndAccumulateF(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const final;
std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const final;
std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalFtF() const final;
void UpdateBlockDiagonalEtE(BlockSparseMatrix* block_diagonal) const final;
diff --git a/internal/ceres/partitioned_matrix_view_benchmark.cc b/internal/ceres/partitioned_matrix_view_benchmark.cc
new file mode 100644
index 0000000..63d0e34
--- /dev/null
+++ b/internal/ceres/partitioned_matrix_view_benchmark.cc
@@ -0,0 +1,147 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#include <memory>
+#include <random>
+
+#include "benchmark/benchmark.h"
+#include "ceres/context_impl.h"
+#include "ceres/fake_bundle_adjustment_jacobian.h"
+#include "ceres/partitioned_matrix_view.h"
+
+constexpr int kNumCameras = 1000;
+constexpr int kNumPoints = 10000;
+constexpr int kCameraSize = 6;
+constexpr int kPointSize = 3;
+constexpr double kVisibility = 0.1;
+
+namespace ceres::internal {
+
+static void BM_PatitionedViewRightMultiplyAndAccumulateE_Static(
+ benchmark::State& state) {
+ const int num_threads = state.range(0);
+ std::mt19937 prng;
+ auto [partitioned_view, jacobian] =
+ CreateFakeBundleAdjustmentPartitionedJacobian<kPointSize, kCameraSize>(
+ kNumCameras, kNumPoints, kVisibility, prng);
+
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ Vector x(partitioned_view->num_cols_e());
+ Vector y(partitioned_view->num_rows());
+ x.setRandom();
+ y.setRandom();
+ double sum = 0;
+ for (auto _ : state) {
+ partitioned_view->RightMultiplyAndAccumulateE(
+ x.data(), y.data(), &context, num_threads);
+ sum += y.norm();
+ }
+ CHECK_NE(sum, 0.0);
+}
+BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateE_Static)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
+
+static void BM_PatitionedViewRightMultiplyAndAccumulateE_Dynamic(
+ benchmark::State& state) {
+ std::mt19937 prng;
+ auto [partitioned_view, jacobian] =
+ CreateFakeBundleAdjustmentPartitionedJacobian(
+ kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
+
+ Vector x(partitioned_view->num_cols_e());
+ Vector y(partitioned_view->num_rows());
+ x.setRandom();
+ y.setRandom();
+ double sum = 0;
+ for (auto _ : state) {
+ partitioned_view->RightMultiplyAndAccumulateE(x.data(), y.data());
+ sum += y.norm();
+ }
+ CHECK_NE(sum, 0.0);
+}
+BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateE_Dynamic);
+
+static void BM_PatitionedViewRightMultiplyAndAccumulateF_Static(
+ benchmark::State& state) {
+ const int num_threads = state.range(0);
+ std::mt19937 prng;
+ auto [partitioned_view, jacobian] =
+ CreateFakeBundleAdjustmentPartitionedJacobian<kPointSize, kCameraSize>(
+ kNumCameras, kNumPoints, kVisibility, prng);
+
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ Vector x(partitioned_view->num_cols_f());
+ Vector y(partitioned_view->num_rows());
+ x.setRandom();
+ y.setRandom();
+ double sum = 0;
+ for (auto _ : state) {
+ partitioned_view->RightMultiplyAndAccumulateF(
+ x.data(), y.data(), &context, num_threads);
+ sum += y.norm();
+ }
+ CHECK_NE(sum, 0.0);
+}
+BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateF_Static)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
+
+static void BM_PatitionedViewRightMultiplyAndAccumulateF_Dynamic(
+ benchmark::State& state) {
+ std::mt19937 prng;
+ auto [partitioned_view, jacobian] =
+ CreateFakeBundleAdjustmentPartitionedJacobian(
+ kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
+
+ Vector x(partitioned_view->num_cols_f());
+ Vector y(partitioned_view->num_rows());
+ x.setRandom();
+ y.setRandom();
+ double sum = 0;
+ for (auto _ : state) {
+ partitioned_view->RightMultiplyAndAccumulateF(x.data(), y.data());
+ sum += y.norm();
+ }
+ CHECK_NE(sum, 0.0);
+}
+BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateF_Dynamic);
+
+} // namespace ceres::internal
+
+BENCHMARK_MAIN();
diff --git a/internal/ceres/partitioned_matrix_view_impl.h b/internal/ceres/partitioned_matrix_view_impl.h
index 2150660..fea0a7d 100644
--- a/internal/ceres/partitioned_matrix_view_impl.h
+++ b/internal/ceres/partitioned_matrix_view_impl.h
@@ -36,6 +36,7 @@
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
#include "ceres/partitioned_matrix_view.h"
#include "ceres/small_blas.h"
#include "glog/logging.h"
@@ -88,71 +89,100 @@
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
RightMultiplyAndAccumulateE(const double* x, double* y) const {
- const CompressedRowBlockStructure* bs = matrix_.block_structure();
+ RightMultiplyAndAccumulateE(x, y, nullptr, 1);
+}
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+ RightMultiplyAndAccumulateE(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const {
// Iterate over the first num_row_blocks_e_ row blocks, and multiply
// by the first cell in each row block.
+ auto bs = matrix_.block_structure();
const double* values = matrix_.values();
- for (int r = 0; r < num_row_blocks_e_; ++r) {
- const Cell& cell = bs->rows[r].cells[0];
- const int row_block_pos = bs->rows[r].block.position;
- const int row_block_size = bs->rows[r].block.size;
- const int col_block_id = cell.block_id;
- const int col_block_pos = bs->cols[col_block_id].position;
- const int col_block_size = bs->cols[col_block_id].size;
- // clang-format off
- MatrixVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
- values + cell.position, row_block_size, col_block_size,
- x + col_block_pos,
- y + row_block_pos);
- // clang-format on
- }
+ ParallelFor(context,
+ 0,
+ num_row_blocks_e_,
+ num_threads,
+ [values, bs, x, y](int row_block_id) {
+ const Cell& cell = bs->rows[row_block_id].cells[0];
+ const int row_block_pos = bs->rows[row_block_id].block.position;
+ const int row_block_size = bs->rows[row_block_id].block.size;
+ const int col_block_id = cell.block_id;
+ const int col_block_pos = bs->cols[col_block_id].position;
+ const int col_block_size = bs->cols[col_block_id].size;
+ // clang-format off
+ MatrixVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
+ values + cell.position, row_block_size, col_block_size,
+ x + col_block_pos,
+ y + row_block_pos);
+ // clang-format on
+ });
}
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
RightMultiplyAndAccumulateF(const double* x, double* y) const {
- const CompressedRowBlockStructure* bs = matrix_.block_structure();
+ RightMultiplyAndAccumulateF(x, y, nullptr, 1);
+}
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+ RightMultiplyAndAccumulateF(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const {
// Iterate over row blocks, and if the row block is in E, then
// multiply by all the cells except the first one which is of type
// E. If the row block is not in E (i.e its in the bottom
// num_row_blocks - num_row_blocks_e row blocks), then all the cells
// are of type F and multiply by them all.
+ const CompressedRowBlockStructure* bs = matrix_.block_structure();
+ const int num_row_blocks = bs->rows.size();
+ const int num_cols_e = num_cols_e_;
const double* values = matrix_.values();
- for (int r = 0; r < num_row_blocks_e_; ++r) {
- const int row_block_pos = bs->rows[r].block.position;
- const int row_block_size = bs->rows[r].block.size;
- const std::vector<Cell>& cells = bs->rows[r].cells;
- for (int c = 1; c < cells.size(); ++c) {
- const int col_block_id = cells[c].block_id;
- const int col_block_pos = bs->cols[col_block_id].position;
- const int col_block_size = bs->cols[col_block_id].size;
- // clang-format off
- MatrixVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
- values + cells[c].position, row_block_size, col_block_size,
- x + col_block_pos - num_cols_e_,
- y + row_block_pos);
- // clang-format on
- }
- }
-
- for (int r = num_row_blocks_e_; r < bs->rows.size(); ++r) {
- const int row_block_pos = bs->rows[r].block.position;
- const int row_block_size = bs->rows[r].block.size;
- const std::vector<Cell>& cells = bs->rows[r].cells;
- for (const auto& cell : cells) {
- const int col_block_id = cell.block_id;
- const int col_block_pos = bs->cols[col_block_id].position;
- const int col_block_size = bs->cols[col_block_id].size;
- // clang-format off
- MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
- values + cell.position, row_block_size, col_block_size,
- x + col_block_pos - num_cols_e_,
- y + row_block_pos);
- // clang-format on
- }
- }
+ ParallelFor(context,
+ 0,
+ num_row_blocks_e_,
+ num_threads,
+ [values, bs, num_cols_e, x, y](int row_block_id) {
+ const int row_block_pos = bs->rows[row_block_id].block.position;
+ const int row_block_size = bs->rows[row_block_id].block.size;
+ const auto& cells = bs->rows[row_block_id].cells;
+ for (int c = 1; c < cells.size(); ++c) {
+ const int col_block_id = cells[c].block_id;
+ const int col_block_pos = bs->cols[col_block_id].position;
+ const int col_block_size = bs->cols[col_block_id].size;
+ // clang-format off
+ MatrixVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
+ values + cells[c].position, row_block_size, col_block_size,
+ x + col_block_pos - num_cols_e,
+ y + row_block_pos);
+ // clang-format on
+ }
+ });
+ ParallelFor(context,
+ num_row_blocks_e_,
+ num_row_blocks,
+ num_threads,
+ [values, bs, num_cols_e, x, y](int row_block_id) {
+ const int row_block_pos = bs->rows[row_block_id].block.position;
+ const int row_block_size = bs->rows[row_block_id].block.size;
+ const auto& cells = bs->rows[row_block_id].cells;
+ for (const auto& cell : cells) {
+ const int col_block_id = cell.block_id;
+ const int col_block_pos = bs->cols[col_block_id].position;
+ const int col_block_size = bs->cols[col_block_id].size;
+ // clang-format off
+ MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+ values + cell.position, row_block_size, col_block_size,
+ x + col_block_pos - num_cols_e,
+ y + row_block_pos);
+ // clang-format on
+ }
+ });
}
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
diff --git a/internal/ceres/partitioned_matrix_view_test.cc b/internal/ceres/partitioned_matrix_view_test.cc
index e4484dd..b11ea8e 100644
--- a/internal/ceres/partitioned_matrix_view_test.cc
+++ b/internal/ceres/partitioned_matrix_view_test.cc
@@ -174,5 +174,64 @@
EXPECT_NEAR(block_diagonal_ff->values()[2], 37.0, kEpsilon);
}
+const int kMaxNumThreads = 8;
+class PartitionedMatrixViewParallelTest : public ::testing::TestWithParam<int> {
+ protected:
+ void SetUp() final {
+ std::unique_ptr<LinearLeastSquaresProblem> problem =
+ CreateLinearLeastSquaresProblemFromId(2);
+ CHECK(problem != nullptr);
+ A_ = std::move(problem->A);
+
+ num_cols_ = A_->num_cols();
+ num_rows_ = A_->num_rows();
+ num_eliminate_blocks_ = problem->num_eliminate_blocks;
+ LinearSolver::Options options;
+ options.elimination_groups.push_back(num_eliminate_blocks_);
+ pmv_ = PartitionedMatrixViewBase::Create(
+ options, *down_cast<BlockSparseMatrix*>(A_.get()));
+ context_.EnsureMinimumThreads(kMaxNumThreads);
+ }
+
+ double RandDouble() { return distribution_(prng_); }
+
+ ContextImpl context_;
+ int num_rows_;
+ int num_cols_;
+ int num_eliminate_blocks_;
+ std::unique_ptr<SparseMatrix> A_;
+ std::unique_ptr<PartitionedMatrixViewBase> pmv_;
+ std::mt19937 prng_;
+ std::uniform_real_distribution<double> distribution_ =
+ std::uniform_real_distribution<double>(0.0, 1.0);
+};
+
+TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateEParallel) {
+ const int kNumThreads = GetParam();
+ Vector x1(pmv_->num_cols_e());
+ Vector x2(pmv_->num_cols());
+ x2.setZero();
+
+ for (int i = 0; i < pmv_->num_cols_e(); ++i) {
+ x1(i) = x2(i) = RandDouble();
+ }
+
+ Vector y1 = Vector::Zero(pmv_->num_rows());
+ pmv_->RightMultiplyAndAccumulateE(
+ x1.data(), y1.data(), &context_, kNumThreads);
+
+ Vector y2 = Vector::Zero(pmv_->num_rows());
+ A_->RightMultiplyAndAccumulate(x2.data(), y2.data());
+
+ for (int i = 0; i < pmv_->num_rows(); ++i) {
+ EXPECT_NEAR(y1(i), y2(i), kEpsilon);
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(ParallelProducts,
+ PartitionedMatrixViewParallelTest,
+ ::testing::Values(1, 2, 4, 8),
+ ::testing::PrintToStringParamName());
+
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/power_series_expansion_preconditioner_test.cc b/internal/ceres/power_series_expansion_preconditioner_test.cc
index a291405..233d4f7 100644
--- a/internal/ceres/power_series_expansion_preconditioner_test.cc
+++ b/internal/ceres/power_series_expansion_preconditioner_test.cc
@@ -45,10 +45,9 @@
const auto A = down_cast<BlockSparseMatrix*>(problem_->A.get());
const auto D = problem_->D.get();
- LinearSolver::Options options;
- options.elimination_groups.push_back(problem_->num_eliminate_blocks);
- options.preconditioner_type = SCHUR_POWER_SERIES_EXPANSION;
- isc_ = std::make_unique<ImplicitSchurComplement>(options);
+ options_.elimination_groups.push_back(problem_->num_eliminate_blocks);
+ options_.preconditioner_type = SCHUR_POWER_SERIES_EXPANSION;
+ isc_ = std::make_unique<ImplicitSchurComplement>(options_);
isc_->Init(*A, D, problem_->b.get());
num_f_cols_ = isc_->rhs().rows();
const int num_rows = A->num_rows(), num_cols = A->num_cols(),
@@ -76,6 +75,7 @@
std::unique_ptr<ImplicitSchurComplement> isc_;
int num_f_cols_;
Matrix sc_inverse_expected_;
+ LinearSolver::Options options_;
};
TEST_F(PowerSeriesExpansionPreconditionerTest,
@@ -168,4 +168,4 @@
}
}
-} // namespace ceres::internal
\ No newline at end of file
+} // namespace ceres::internal
diff --git a/internal/ceres/sparse_matrix.h b/internal/ceres/sparse_matrix.h
index da6af18..e665c8b 100644
--- a/internal/ceres/sparse_matrix.h
+++ b/internal/ceres/sparse_matrix.h
@@ -41,6 +41,7 @@
#include "ceres/types.h"
namespace ceres::internal {
+struct ContextImpl;
// This class defines the interface for storing and manipulating
// sparse matrices. The key property that differentiates different
diff --git a/internal/ceres/spmv_benchmark.cc b/internal/ceres/spmv_benchmark.cc
index 0bd1e43..69df1ea 100644
--- a/internal/ceres/spmv_benchmark.cc
+++ b/internal/ceres/spmv_benchmark.cc
@@ -66,26 +66,37 @@
static void BM_BlockSparseRightMultiplyAndAccumulateBA(
benchmark::State& state) {
+ const int num_threads = state.range(0);
std::mt19937 prng;
auto jacobian = CreateFakeBundleAdjustmentJacobian(
kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
Vector x(jacobian->num_cols());
Vector y(jacobian->num_rows());
x.setRandom();
y.setRandom();
double sum = 0;
for (auto _ : state) {
- jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
+ jacobian->RightMultiplyAndAccumulate(
+ x.data(), y.data(), &context, num_threads);
sum += y.norm();
}
CHECK_NE(sum, 0.0);
}
-BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA);
+BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
static void BM_BlockSparseRightMultiplyAndAccumulateUnstructured(
benchmark::State& state) {
+ const int num_threads = state.range(0);
BlockSparseMatrix::RandomMatrixOptions options;
options.num_row_blocks = kNumRowBlocks;
options.num_col_blocks = kNumColBlocks;
@@ -98,19 +109,28 @@
auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
Vector x(jacobian->num_cols());
Vector y(jacobian->num_rows());
x.setRandom();
y.setRandom();
double sum = 0;
for (auto _ : state) {
- jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
+ jacobian->RightMultiplyAndAccumulate(
+ x.data(), y.data(), &context, num_threads);
sum += y.norm();
}
CHECK_NE(sum, 0.0);
}
-BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured);
+BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
static void BM_BlockSparseLeftMultiplyAndAccumulateBA(benchmark::State& state) {
std::mt19937 prng;
@@ -158,6 +178,7 @@
BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateUnstructured);
static void BM_CRSRightMultiplyAndAccumulateBA(benchmark::State& state) {
+ const int num_threads = state.range(0);
std::mt19937 prng;
auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian(
kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
@@ -167,22 +188,32 @@
bsm_jacobian->num_nonzeros());
bsm_jacobian->ToCompressedRowSparseMatrix(&jacobian);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
Vector x(jacobian.num_cols());
Vector y(jacobian.num_rows());
x.setRandom();
y.setRandom();
double sum = 0;
for (auto _ : state) {
- jacobian.RightMultiplyAndAccumulate(x.data(), y.data());
+ jacobian.RightMultiplyAndAccumulate(
+ x.data(), y.data(), &context, num_threads);
sum += y.norm();
}
CHECK_NE(sum, 0.0);
}
-BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA);
+BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
static void BM_CRSRightMultiplyAndAccumulateUnstructured(
benchmark::State& state) {
+ const int num_threads = state.range(0);
BlockSparseMatrix::RandomMatrixOptions options;
options.num_row_blocks = kNumRowBlocks;
options.num_col_blocks = kNumColBlocks;
@@ -199,19 +230,28 @@
bsm_jacobian->num_nonzeros());
bsm_jacobian->ToCompressedRowSparseMatrix(&jacobian);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
Vector x(jacobian.num_cols());
Vector y(jacobian.num_rows());
x.setRandom();
y.setRandom();
double sum = 0;
for (auto _ : state) {
- jacobian.RightMultiplyAndAccumulate(x.data(), y.data());
+ jacobian.RightMultiplyAndAccumulate(
+ x.data(), y.data(), &context, num_threads);
sum += y.norm();
}
CHECK_NE(sum, 0.0);
}
-BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured);
+BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
static void BM_CRSLeftMultiplyAndAccumulateBA(benchmark::State& state) {
std::mt19937 prng;