Parallel right products for partitioned view Parallel implementations for right-multiply by dense vector for: - Partitioned matrix view - Block-sparse matrix - CRS matrix (non-symmetric only) When coupled with non-interleaving indexes in parallel for, this simple aproach provides a reasonable speedup. For example, in CRS case difference with GPGPU approach reduces closer to memory throughput ratio for high enough core count. ./bin/spmv_benchmark ------------------------------------------------------------------- Benchmark Time ------------------------------------------------------------------- BM_BlockSparseRightMultiplyAndAccumulateBA/1 28.5 ms BM_BlockSparseRightMultiplyAndAccumulateBA/2 15.7 ms BM_BlockSparseRightMultiplyAndAccumulateBA/4 9.01 ms BM_BlockSparseRightMultiplyAndAccumulateBA/8 5.60 ms BM_BlockSparseRightMultiplyAndAccumulateBA/16 3.86 ms BM_BlockSparseRightMultiplyAndAccumulateBA/28 3.84 ms BM_BlockSparseRightMultiplyAndAccumulateUnstructured/1 23.8 ms BM_BlockSparseRightMultiplyAndAccumulateUnstructured/2 15.0 ms BM_BlockSparseRightMultiplyAndAccumulateUnstructured/4 8.01 ms BM_BlockSparseRightMultiplyAndAccumulateUnstructured/8 4.02 ms BM_BlockSparseRightMultiplyAndAccumulateUnstructured/16 2.39 ms BM_BlockSparseRightMultiplyAndAccumulateUnstructured/28 1.68 ms BM_BlockSparseLeftMultiplyAndAccumulateBA 30.7 ms BM_BlockSparseLeftMultiplyAndAccumulateUnstructured 41.5 ms BM_CRSRightMultiplyAndAccumulateBA/1 24.1 ms BM_CRSRightMultiplyAndAccumulateBA/2 13.6 ms BM_CRSRightMultiplyAndAccumulateBA/4 8.70 ms BM_CRSRightMultiplyAndAccumulateBA/8 5.34 ms BM_CRSRightMultiplyAndAccumulateBA/16 3.99 ms BM_CRSRightMultiplyAndAccumulateBA/28 4.00 ms BM_CRSRightMultiplyAndAccumulateUnstructured/1 21.1 ms BM_CRSRightMultiplyAndAccumulateUnstructured/2 10.83 ms BM_CRSRightMultiplyAndAccumulateUnstructured/4 5.88 ms BM_CRSRightMultiplyAndAccumulateUnstructured/8 3.68 ms BM_CRSRightMultiplyAndAccumulateUnstructured/16 2.21 ms BM_CRSRightMultiplyAndAccumulateUnstructured/28 1.71 ms BM_CRSLeftMultiplyAndAccumulateBA 23.6 ms BM_CRSLeftMultiplyAndAccumulateUnstructured 22.5 ms BM_CudaRightMultiplyAndAccumulateBA 0.679 ms BM_CudaRightMultiplyAndAccumulateUnstructured 0.480 ms BM_CudaLeftMultiplyAndAccumulateBA 0.774 ms BM_CudaLeftMultiplyAndAccumulateUnstructured 0.361 ms ./bin/partitioned_matrix_view_benchmark ----------------------------------------------------------------- Benchmark Time ----------------------------------------------------------------- BM_PatitionedViewRightMultiplyAndAccumulateE_Static/1 18.5 ms BM_PatitionedViewRightMultiplyAndAccumulateE_Static/2 10.7 ms BM_PatitionedViewRightMultiplyAndAccumulateE_Static/4 6.34 ms BM_PatitionedViewRightMultiplyAndAccumulateE_Static/8 4.26 ms BM_PatitionedViewRightMultiplyAndAccumulateE_Static/16 3.86 ms BM_PatitionedViewRightMultiplyAndAccumulateE_Static/28 3.75 ms BM_PatitionedViewRightMultiplyAndAccumulateF_Static/1 18.8 ms BM_PatitionedViewRightMultiplyAndAccumulateF_Static/2 11.9 ms BM_PatitionedViewRightMultiplyAndAccumulateF_Static/4 6.94 ms BM_PatitionedViewRightMultiplyAndAccumulateF_Static/8 4.41 ms BM_PatitionedViewRightMultiplyAndAccumulateF_Static/16 3.63 ms BM_PatitionedViewRightMultiplyAndAccumulateF_Static/28 3.86 ms Timings correspond to intel 8176 cpu and 2080ti nvidia gpu, with OpenMP threading backend. Change-Id: Idc07d0563103d057ca3c8412de81a7823fe232af
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index fef1c2d..350b3b5 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -598,6 +598,9 @@ add_executable(spmv_benchmark spmv_benchmark.cc) add_dependencies_to_benchmark(spmv_benchmark) + add_executable(partitioned_matrix_view_benchmark partitioned_matrix_view_benchmark.cc) + add_dependencies_to_benchmark(partitioned_matrix_view_benchmark) + add_executable(block_jacobi_preconditioner_benchmark block_jacobi_preconditioner_benchmark.cc) add_dependencies_to_benchmark(block_jacobi_preconditioner_benchmark)
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc index 0401696..0dd3b20 100644 --- a/internal/ceres/block_sparse_matrix.cc +++ b/internal/ceres/block_sparse_matrix.cc
@@ -39,6 +39,7 @@ #include "ceres/block_structure.h" #include "ceres/crs_matrix.h" #include "ceres/internal/eigen.h" +#include "ceres/parallel_for.h" #include "ceres/small_blas.h" #include "ceres/triplet_sparse_matrix.h" #include "glog/logging.h" @@ -88,25 +89,44 @@ void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x, double* y) const { + RightMultiplyAndAccumulate(x, y, nullptr, 1); +} + +void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x, + double* y, + ContextImpl* context, + int num_threads) const { CHECK(x != nullptr); CHECK(y != nullptr); - for (int i = 0; i < block_structure_->rows.size(); ++i) { - int row_block_pos = block_structure_->rows[i].block.position; - int row_block_size = block_structure_->rows[i].block.size; - const std::vector<Cell>& cells = block_structure_->rows[i].cells; - for (const auto& cell : cells) { - int col_block_id = cell.block_id; - int col_block_size = block_structure_->cols[col_block_id].size; - int col_block_pos = block_structure_->cols[col_block_id].position; - MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>( - values_.get() + cell.position, - row_block_size, - col_block_size, - x + col_block_pos, - y + row_block_pos); - } - } + const auto values = values_.get(); + const auto block_structure = block_structure_.get(); + const auto num_row_blocks = block_structure->rows.size(); + + ParallelFor(context, + 0, + num_row_blocks, + num_threads, + [values, block_structure, x, y](int row_block_id) { + const int row_block_pos = + block_structure->rows[row_block_id].block.position; + const int row_block_size = + block_structure->rows[row_block_id].block.size; + const auto& cells = block_structure->rows[row_block_id].cells; + for (const auto& cell : cells) { + const int col_block_id = cell.block_id; + const int col_block_size = + block_structure->cols[col_block_id].size; + const int col_block_pos = + block_structure->cols[col_block_id].position; + MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>( + values + cell.position, + row_block_size, + col_block_size, + x + col_block_pos, + y + row_block_pos); + } + }); } void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h index b7ea1df..3a8dac9 100644 --- a/internal/ceres/block_sparse_matrix.h +++ b/internal/ceres/block_sparse_matrix.h
@@ -39,6 +39,7 @@ #include "ceres/block_structure.h" #include "ceres/compressed_row_sparse_matrix.h" +#include "ceres/context_impl.h" #include "ceres/internal/disable_warnings.h" #include "ceres/internal/eigen.h" #include "ceres/internal/export.h" @@ -66,13 +67,16 @@ // CompressedRowBlockStructure objects. explicit BlockSparseMatrix(CompressedRowBlockStructure* block_structure); - BlockSparseMatrix(); BlockSparseMatrix(const BlockSparseMatrix&) = delete; void operator=(const BlockSparseMatrix&) = delete; // Implementation of SparseMatrix interface. void SetZero() final; void RightMultiplyAndAccumulate(const double* x, double* y) const final; + void RightMultiplyAndAccumulate(const double* x, + double* y, + ContextImpl* context, + int num_threads) const final; void LeftMultiplyAndAccumulate(const double* x, double* y) const final; void SquaredColumnNorm(double* x) const final; void ScaleColumns(const double* scale) final;
diff --git a/internal/ceres/block_sparse_matrix_test.cc b/internal/ceres/block_sparse_matrix_test.cc index 7d7d4c9..3f5c78a 100644 --- a/internal/ceres/block_sparse_matrix_test.cc +++ b/internal/ceres/block_sparse_matrix_test.cc
@@ -115,6 +115,8 @@ } } // namespace +const int kNumThreads = 4; + class BlockSparseMatrixTest : public ::testing::Test { protected: void SetUp() final { @@ -130,10 +132,12 @@ CHECK_EQ(A_->num_rows(), B_->num_rows()); CHECK_EQ(A_->num_cols(), B_->num_cols()); CHECK_EQ(A_->num_nonzeros(), B_->num_nonzeros()); + context_.EnsureMinimumThreads(kNumThreads); } std::unique_ptr<BlockSparseMatrix> A_; std::unique_ptr<TripletSparseMatrix> B_; + ContextImpl context_; }; TEST_F(BlockSparseMatrixTest, SetZeroTest) { @@ -153,6 +157,20 @@ } } +TEST_F(BlockSparseMatrixTest, RightMultiplyAndAccumulateParallelTest) { + Vector y_0 = Vector::Random(A_->num_rows()); + Vector y_s = y_0; + Vector y_p = y_0; + + Vector x = Vector::Random(A_->num_cols()); + A_->RightMultiplyAndAccumulate(x.data(), y_s.data()); + + A_->RightMultiplyAndAccumulate(x.data(), y_p.data(), &context_, kNumThreads); + + // Current parallel implementation is expected to be bit-exact + EXPECT_EQ((y_s - y_p).norm(), 0.); +} + TEST_F(BlockSparseMatrixTest, LeftMultiplyAndAccumulateTest) { Vector y_a = Vector::Zero(A_->num_cols()); Vector y_b = Vector::Zero(A_->num_cols());
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc index 17a5f62..e1c571d 100644 --- a/internal/ceres/cgnr_solver.cc +++ b/internal/ceres/cgnr_solver.cc
@@ -84,14 +84,21 @@ class CERES_NO_EXPORT CgnrLinearOperator final : public ConjugateGradientsLinearOperator<Vector> { public: - CgnrLinearOperator(const LinearOperator& A, const double* D) - : A_(A), D_(D), z_(Vector::Zero(A.num_rows())) {} + CgnrLinearOperator(const LinearOperator& A, + const double* D, + ContextImpl* context, + int num_threads) + : A_(A), + D_(D), + z_(Vector::Zero(A.num_rows())), + context_(context), + num_threads_(num_threads) {} void RightMultiplyAndAccumulate(const Vector& x, Vector& y) final { // z = Ax // y = y + Atz z_.setZero(); - A_.RightMultiplyAndAccumulate(x, z_); + A_.RightMultiplyAndAccumulate(x, z_, context_, num_threads_); A_.LeftMultiplyAndAccumulate(z_, y); // y = y + DtDx @@ -105,6 +112,9 @@ const LinearOperator& A_; const double* D_; Vector z_; + + ContextImpl* context_; + int num_threads_; }; CgnrSolver::CgnrSolver(LinearSolver::Options options) @@ -166,7 +176,8 @@ cg_options.r_tolerance = per_solve_options.r_tolerance; // lhs = AtA + DtD - CgnrLinearOperator lhs(*A, per_solve_options.D); + CgnrLinearOperator lhs( + *A, per_solve_options.D, options_.context, options_.num_threads); // rhs = Atb. Vector rhs(A->num_cols()); rhs.setZero();
diff --git a/internal/ceres/compressed_row_sparse_matrix.cc b/internal/ceres/compressed_row_sparse_matrix.cc index 057adfd..553dc21 100644 --- a/internal/ceres/compressed_row_sparse_matrix.cc +++ b/internal/ceres/compressed_row_sparse_matrix.cc
@@ -37,8 +37,10 @@ #include <random> #include <vector> +#include "ceres/context_impl.h" #include "ceres/crs_matrix.h" #include "ceres/internal/export.h" +#include "ceres/parallel_for.h" #include "ceres/triplet_sparse_matrix.h" #include "glog/logging.h" @@ -278,19 +280,34 @@ // TODO(sameeragarwal): Make RightMultiplyAndAccumulate and // LeftMultiplyAndAccumulate block-aware for higher performance. +void CompressedRowSparseMatrix::RightMultiplyAndAccumulate( + const double* x, double* y, ContextImpl* context, int num_threads) const { + if (storage_type_ != StorageType::UNSYMMETRIC) { + RightMultiplyAndAccumulate(x, y); + return; + } + + auto values = values_.data(); + auto rows = rows_.data(); + auto cols = cols_.data(); + + ParallelFor( + context, 0, num_rows_, num_threads, [values, rows, cols, x, y](int row) { + for (int idx = rows[row]; idx < rows[row + 1]; ++idx) { + const int c = cols[idx]; + const double v = values[idx]; + y[row] += v * x[c]; + } + }); +} + void CompressedRowSparseMatrix::RightMultiplyAndAccumulate(const double* x, double* y) const { CHECK(x != nullptr); CHECK(y != nullptr); if (storage_type_ == StorageType::UNSYMMETRIC) { - for (int r = 0; r < num_rows_; ++r) { - for (int idx = rows_[r]; idx < rows_[r + 1]; ++idx) { - const int c = cols_[idx]; - const double v = values_[idx]; - y[r] += v * x[c]; - } - } + RightMultiplyAndAccumulate(x, y, nullptr, 1); } else if (storage_type_ == StorageType::UPPER_TRIANGULAR) { // Because of their block structure, we will have entries that lie // above (below) the diagonal for lower (upper) triangular matrices,
diff --git a/internal/ceres/compressed_row_sparse_matrix.h b/internal/ceres/compressed_row_sparse_matrix.h index 8668d06..6fe730e 100644 --- a/internal/ceres/compressed_row_sparse_matrix.h +++ b/internal/ceres/compressed_row_sparse_matrix.h
@@ -48,6 +48,7 @@ namespace internal { +class ContextImpl; class TripletSparseMatrix; class CERES_NO_EXPORT CompressedRowSparseMatrix : public SparseMatrix { @@ -103,6 +104,10 @@ ~CompressedRowSparseMatrix() override; void SetZero() final; void RightMultiplyAndAccumulate(const double* x, double* y) const final; + void RightMultiplyAndAccumulate(const double* x, + double* y, + ContextImpl* context, + int num_threads) const final; void LeftMultiplyAndAccumulate(const double* x, double* y) const final; void SquaredColumnNorm(double* x) const final; void ScaleColumns(const double* scale) final;
diff --git a/internal/ceres/compressed_row_sparse_matrix_test.cc b/internal/ceres/compressed_row_sparse_matrix_test.cc index 82c982b..46136db 100644 --- a/internal/ceres/compressed_row_sparse_matrix_test.cc +++ b/internal/ceres/compressed_row_sparse_matrix_test.cc
@@ -39,6 +39,7 @@ #include "Eigen/SparseCore" #include "ceres/casts.h" +#include "ceres/context_impl.h" #include "ceres/crs_matrix.h" #include "ceres/internal/eigen.h" #include "ceres/linear_least_squares_problems.h" @@ -599,6 +600,73 @@ CompressedRowSparseMatrix::StorageType::UNSYMMETRIC), ParamInfoToString); +const int kMaxNumThreads = 8; +class CompressedRowSparseMatrixParallelTest + : public ::testing::TestWithParam<int> { + void SetUp() final { context_.EnsureMinimumThreads(kMaxNumThreads); } + + protected: + ContextImpl context_; +}; + +TEST_P(CompressedRowSparseMatrixParallelTest, + RightMultiplyAndAccumulateUnsymmetric) { + const int kMinNumBlocks = 1; + const int kMaxNumBlocks = 10; + const int kMinBlockSize = 1; + const int kMaxBlockSize = 5; + const int kNumTrials = 10; + const int kNumThreads = GetParam(); + std::mt19937 prng; + std::uniform_real_distribution<double> uniform(0.5, 1.0); + for (int num_blocks = kMinNumBlocks; num_blocks < kMaxNumBlocks; + ++num_blocks) { + for (int trial = 0; trial < kNumTrials; ++trial) { + CompressedRowSparseMatrix::RandomMatrixOptions options; + options.num_col_blocks = num_blocks; + options.min_col_block_size = kMinBlockSize; + options.max_col_block_size = kMaxBlockSize; + options.num_row_blocks = 2 * num_blocks; + options.min_row_block_size = kMinBlockSize; + options.max_row_block_size = kMaxBlockSize; + options.block_density = uniform(prng); + options.storage_type = + CompressedRowSparseMatrix::StorageType::UNSYMMETRIC; + auto matrix = + CompressedRowSparseMatrix::CreateRandomMatrix(options, prng); + const int num_rows = matrix->num_rows(); + const int num_cols = matrix->num_cols(); + + Vector x(num_cols); + x.setRandom(); + + Vector actual_y(num_rows); + actual_y.setZero(); + matrix->RightMultiplyAndAccumulate( + x.data(), actual_y.data(), &context_, kNumThreads); + + Matrix dense; + matrix->ToDenseMatrix(&dense); + Vector expected_y = dense * x; + + ASSERT_NEAR((expected_y - actual_y).norm() / actual_y.norm(), + 0.0, + std::numeric_limits<double>::epsilon() * 10) + << "\n" + << dense << "x:\n" + << x.transpose() << "\n" + << "expected: \n" + << expected_y.transpose() << "\n" + << "actual: \n" + << actual_y.transpose(); + } + } +} +INSTANTIATE_TEST_SUITE_P(ParallelProducts, + CompressedRowSparseMatrixParallelTest, + ::testing::Values(1, 2, 4, 8), + ::testing::PrintToStringParamName()); + // TODO(sameeragarwal) Add tests for the random matrix creation methods. } // namespace ceres::internal
diff --git a/internal/ceres/fake_bundle_adjustment_jacobian.cc b/internal/ceres/fake_bundle_adjustment_jacobian.cc index 9e7d82b..dd2aee8 100644 --- a/internal/ceres/fake_bundle_adjustment_jacobian.cc +++ b/internal/ceres/fake_bundle_adjustment_jacobian.cc
@@ -96,4 +96,23 @@ return jacobian; } +std::pair< + std::unique_ptr<PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>>, + std::unique_ptr<BlockSparseMatrix>> +CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras, + int num_points, + int camera_size, + int landmark_size, + double visibility, + std::mt19937& rng) { + using PartitionedView = + PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>; + auto block_sparse_matrix = CreateFakeBundleAdjustmentJacobian( + num_cameras, num_points, camera_size, landmark_size, visibility, rng); + auto partitioned_view = + std::make_unique<PartitionedView>(*block_sparse_matrix, num_points); + return std::make_pair(std::move(partitioned_view), + std::move(block_sparse_matrix)); +} + } // namespace ceres::internal
diff --git a/internal/ceres/fake_bundle_adjustment_jacobian.h b/internal/ceres/fake_bundle_adjustment_jacobian.h index 9492b52..41b55c8 100644 --- a/internal/ceres/fake_bundle_adjustment_jacobian.h +++ b/internal/ceres/fake_bundle_adjustment_jacobian.h
@@ -36,6 +36,7 @@ #include <random> #include "ceres/block_sparse_matrix.h" +#include "ceres/partitioned_matrix_view.h" namespace ceres::internal { std::unique_ptr<BlockSparseMatrix> CreateFakeBundleAdjustmentJacobian( @@ -46,6 +47,32 @@ double visibility, std::mt19937& prng); +template <int kEBlockSize = 3, int kFBlockSize = 6> +std::pair<std::unique_ptr<PartitionedMatrixView<2, kEBlockSize, kFBlockSize>>, + std::unique_ptr<BlockSparseMatrix>> +CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras, + int num_points, + double visibility, + std::mt19937& rng) { + using PartitionedView = PartitionedMatrixView<2, kEBlockSize, kFBlockSize>; + auto block_sparse_matrix = CreateFakeBundleAdjustmentJacobian( + num_cameras, num_points, kFBlockSize, kEBlockSize, visibility, rng); + auto partitioned_view = + std::make_unique<PartitionedView>(*block_sparse_matrix, num_points); + return std::make_pair(std::move(partitioned_view), + std::move(block_sparse_matrix)); +} + +std::pair< + std::unique_ptr<PartitionedMatrixView<2, Eigen::Dynamic, Eigen::Dynamic>>, + std::unique_ptr<BlockSparseMatrix>> +CreateFakeBundleAdjustmentPartitionedJacobian(int num_cameras, + int num_points, + int camera_size, + int landmark_size, + double visibility, + std::mt19937& rng); + } // namespace ceres::internal #endif // CERES_INTERNAL_FAKE_BUNDLE_ADJUSTMENT_JACOBIAN
diff --git a/internal/ceres/implicit_schur_complement.cc b/internal/ceres/implicit_schur_complement.cc index 9b7ee47..bbd5e4b 100644 --- a/internal/ceres/implicit_schur_complement.cc +++ b/internal/ceres/implicit_schur_complement.cc
@@ -105,7 +105,8 @@ double* y) const { // y1 = F x tmp_rows_.setZero(); - A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data()); + A_->RightMultiplyAndAccumulateF( + x, tmp_rows_.data(), options_.context, options_.num_threads); // y2 = E' y1 tmp_e_cols_.setZero(); @@ -114,11 +115,16 @@ // y3 = -(E'E)^-1 y2 tmp_e_cols_2_.setZero(); block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(), - tmp_e_cols_2_.data()); + tmp_e_cols_2_.data(), + options_.context, + options_.num_threads); tmp_e_cols_2_ *= -1.0; // y1 = y1 + E y3 - A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data()); + A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), + tmp_rows_.data(), + options_.context, + options_.num_threads); // y5 = D * x if (D_ != nullptr) { @@ -139,7 +145,8 @@ CHECK(compute_ftf_inverse_); // y1 = F x tmp_rows_.setZero(); - A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data()); + A_->RightMultiplyAndAccumulateF( + x, tmp_rows_.data(), options_.context, options_.num_threads); // y2 = E' y1 tmp_e_cols_.setZero(); @@ -148,18 +155,23 @@ // y3 = (E'E)^-1 y2 tmp_e_cols_2_.setZero(); block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(), - tmp_e_cols_2_.data()); + tmp_e_cols_2_.data(), + options_.context, + options_.num_threads); // y1 = E y3 tmp_rows_.setZero(); - A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data()); + A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), + tmp_rows_.data(), + options_.context, + options_.num_threads); // y4 = F' y1 tmp_f_cols_.setZero(); A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), tmp_f_cols_.data()); // y += (F'F)^-1 y4 - block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate(tmp_f_cols_.data(), - y); + block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate( + tmp_f_cols_.data(), y, options_.context, options_.num_threads); } // Given a block diagonal matrix and an optional array of diagonal @@ -197,7 +209,8 @@ // y1 = F x tmp_rows_.setZero(); - A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data()); + A_->RightMultiplyAndAccumulateF( + x, tmp_rows_.data(), options_.context, options_.num_threads); // y2 = b - y1 tmp_rows_ = ConstVectorRef(b_, num_rows) - tmp_rows_; @@ -208,8 +221,8 @@ // y = (E'E)^-1 y3 VectorRef(y, num_cols).setZero(); - block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(), - y); + block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate( + tmp_e_cols_.data(), y, options_.context, options_.num_threads); // The full solution vector y has two blocks. The first block of // variables corresponds to the eliminated variables, which we just @@ -232,12 +245,13 @@ // y2 = (E'E)^-1 y1 Vector y2 = Vector::Zero(A_->num_cols_e()); - block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(), - y2.data()); + block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate( + tmp_e_cols_.data(), y2.data(), options_.context, options_.num_threads); // y3 = E y2 tmp_rows_.setZero(); - A_->RightMultiplyAndAccumulateE(y2.data(), tmp_rows_.data()); + A_->RightMultiplyAndAccumulateE( + y2.data(), tmp_rows_.data(), options_.context, options_.num_threads); // y3 = b - y3 tmp_rows_ = ConstVectorRef(b_, A_->num_rows()) - tmp_rows_;
diff --git a/internal/ceres/linear_operator.cc b/internal/ceres/linear_operator.cc index 597a37e..1b518a6 100644 --- a/internal/ceres/linear_operator.cc +++ b/internal/ceres/linear_operator.cc
@@ -30,8 +30,34 @@ #include "ceres/linear_operator.h" +#include <glog/logging.h> + namespace ceres::internal { +void LinearOperator::RightMultiplyAndAccumulate(const double* x, + double* y, + ContextImpl* context, + int num_threads) const { + (void)context; + if (num_threads != 1) { + VLOG(3) << "Parallel right product is not supported by linear operator " + "implementation"; + } + RightMultiplyAndAccumulate(x, y); +} + +void LinearOperator::LeftMultiplyAndAccumulate(const double* x, + double* y, + ContextImpl* context, + int num_threads) const { + (void)context; + if (num_threads != 1) { + VLOG(3) << "Parallel left product is not supported by linear operator " + "implementation"; + } + LeftMultiplyAndAccumulate(x, y); +} + LinearOperator::~LinearOperator() = default; } // namespace ceres::internal
diff --git a/internal/ceres/linear_operator.h b/internal/ceres/linear_operator.h index 8a1e902..76beb40 100644 --- a/internal/ceres/linear_operator.h +++ b/internal/ceres/linear_operator.h
@@ -39,6 +39,8 @@ namespace ceres::internal { +class ContextImpl; + // This is an abstract base class for linear operators. It supports // access to size information and left and right multiply operators. class CERES_NO_EXPORT LinearOperator { @@ -47,8 +49,16 @@ // y = y + Ax; virtual void RightMultiplyAndAccumulate(const double* x, double* y) const = 0; + virtual void RightMultiplyAndAccumulate(const double* x, + double* y, + ContextImpl* context, + int num_threads) const; // y = y + A'x; virtual void LeftMultiplyAndAccumulate(const double* x, double* y) const = 0; + virtual void LeftMultiplyAndAccumulate(const double* x, + double* y, + ContextImpl* context, + int num_threads) const; virtual void RightMultiplyAndAccumulate(const Vector& x, Vector& y) const { RightMultiplyAndAccumulate(x.data(), y.data()); @@ -58,6 +68,20 @@ LeftMultiplyAndAccumulate(x.data(), y.data()); } + virtual void RightMultiplyAndAccumulate(const Vector& x, + Vector& y, + ContextImpl* context, + int num_threads) const { + RightMultiplyAndAccumulate(x.data(), y.data(), context, num_threads); + } + + virtual void LeftMultiplyAndAccumulate(const Vector& x, + Vector& y, + ContextImpl* context, + int num_threads) const { + LeftMultiplyAndAccumulate(x.data(), y.data(), context, num_threads); + } + virtual int num_rows() const = 0; virtual int num_cols() const = 0; };
diff --git a/internal/ceres/partitioned_matrix_view.h b/internal/ceres/partitioned_matrix_view.h index 7fb1a09..86106da 100644 --- a/internal/ceres/partitioned_matrix_view.h +++ b/internal/ceres/partitioned_matrix_view.h
@@ -52,6 +52,8 @@ namespace ceres::internal { +class ContextImpl; + // Given generalized bi-partite matrix A = [E F], with the same block // structure as required by the Schur complement based solver, found // in schur_complement_solver.h, provide access to the @@ -75,10 +77,18 @@ // y += Ex virtual void RightMultiplyAndAccumulateE(const double* x, double* y) const = 0; + virtual void RightMultiplyAndAccumulateE(const double* x, + double* y, + ContextImpl* context, + int num_threads) const = 0; // y += Fx virtual void RightMultiplyAndAccumulateF(const double* x, double* y) const = 0; + virtual void RightMultiplyAndAccumulateF(const double* x, + double* y, + ContextImpl* context, + int num_threads) const = 0; // Create and return the block diagonal of the matrix E'E. virtual std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const = 0; @@ -130,6 +140,14 @@ void LeftMultiplyAndAccumulateF(const double* x, double* y) const final; void RightMultiplyAndAccumulateE(const double* x, double* y) const final; void RightMultiplyAndAccumulateF(const double* x, double* y) const final; + void RightMultiplyAndAccumulateE(const double* x, + double* y, + ContextImpl* context, + int num_threads) const final; + void RightMultiplyAndAccumulateF(const double* x, + double* y, + ContextImpl* context, + int num_threads) const final; std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalEtE() const final; std::unique_ptr<BlockSparseMatrix> CreateBlockDiagonalFtF() const final; void UpdateBlockDiagonalEtE(BlockSparseMatrix* block_diagonal) const final;
diff --git a/internal/ceres/partitioned_matrix_view_benchmark.cc b/internal/ceres/partitioned_matrix_view_benchmark.cc new file mode 100644 index 0000000..63d0e34 --- /dev/null +++ b/internal/ceres/partitioned_matrix_view_benchmark.cc
@@ -0,0 +1,147 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +#include <memory> +#include <random> + +#include "benchmark/benchmark.h" +#include "ceres/context_impl.h" +#include "ceres/fake_bundle_adjustment_jacobian.h" +#include "ceres/partitioned_matrix_view.h" + +constexpr int kNumCameras = 1000; +constexpr int kNumPoints = 10000; +constexpr int kCameraSize = 6; +constexpr int kPointSize = 3; +constexpr double kVisibility = 0.1; + +namespace ceres::internal { + +static void BM_PatitionedViewRightMultiplyAndAccumulateE_Static( + benchmark::State& state) { + const int num_threads = state.range(0); + std::mt19937 prng; + auto [partitioned_view, jacobian] = + CreateFakeBundleAdjustmentPartitionedJacobian<kPointSize, kCameraSize>( + kNumCameras, kNumPoints, kVisibility, prng); + + ContextImpl context; + context.EnsureMinimumThreads(num_threads); + + Vector x(partitioned_view->num_cols_e()); + Vector y(partitioned_view->num_rows()); + x.setRandom(); + y.setRandom(); + double sum = 0; + for (auto _ : state) { + partitioned_view->RightMultiplyAndAccumulateE( + x.data(), y.data(), &context, num_threads); + sum += y.norm(); + } + CHECK_NE(sum, 0.0); +} +BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateE_Static) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); + +static void BM_PatitionedViewRightMultiplyAndAccumulateE_Dynamic( + benchmark::State& state) { + std::mt19937 prng; + auto [partitioned_view, jacobian] = + CreateFakeBundleAdjustmentPartitionedJacobian( + kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); + + Vector x(partitioned_view->num_cols_e()); + Vector y(partitioned_view->num_rows()); + x.setRandom(); + y.setRandom(); + double sum = 0; + for (auto _ : state) { + partitioned_view->RightMultiplyAndAccumulateE(x.data(), y.data()); + sum += y.norm(); + } + CHECK_NE(sum, 0.0); +} +BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateE_Dynamic); + +static void BM_PatitionedViewRightMultiplyAndAccumulateF_Static( + benchmark::State& state) { + const int num_threads = state.range(0); + std::mt19937 prng; + auto [partitioned_view, jacobian] = + CreateFakeBundleAdjustmentPartitionedJacobian<kPointSize, kCameraSize>( + kNumCameras, kNumPoints, kVisibility, prng); + + ContextImpl context; + context.EnsureMinimumThreads(num_threads); + + Vector x(partitioned_view->num_cols_f()); + Vector y(partitioned_view->num_rows()); + x.setRandom(); + y.setRandom(); + double sum = 0; + for (auto _ : state) { + partitioned_view->RightMultiplyAndAccumulateF( + x.data(), y.data(), &context, num_threads); + sum += y.norm(); + } + CHECK_NE(sum, 0.0); +} +BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateF_Static) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); + +static void BM_PatitionedViewRightMultiplyAndAccumulateF_Dynamic( + benchmark::State& state) { + std::mt19937 prng; + auto [partitioned_view, jacobian] = + CreateFakeBundleAdjustmentPartitionedJacobian( + kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); + + Vector x(partitioned_view->num_cols_f()); + Vector y(partitioned_view->num_rows()); + x.setRandom(); + y.setRandom(); + double sum = 0; + for (auto _ : state) { + partitioned_view->RightMultiplyAndAccumulateF(x.data(), y.data()); + sum += y.norm(); + } + CHECK_NE(sum, 0.0); +} +BENCHMARK(BM_PatitionedViewRightMultiplyAndAccumulateF_Dynamic); + +} // namespace ceres::internal + +BENCHMARK_MAIN();
diff --git a/internal/ceres/partitioned_matrix_view_impl.h b/internal/ceres/partitioned_matrix_view_impl.h index 2150660..fea0a7d 100644 --- a/internal/ceres/partitioned_matrix_view_impl.h +++ b/internal/ceres/partitioned_matrix_view_impl.h
@@ -36,6 +36,7 @@ #include "ceres/block_sparse_matrix.h" #include "ceres/block_structure.h" #include "ceres/internal/eigen.h" +#include "ceres/parallel_for.h" #include "ceres/partitioned_matrix_view.h" #include "ceres/small_blas.h" #include "glog/logging.h" @@ -88,71 +89,100 @@ template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: RightMultiplyAndAccumulateE(const double* x, double* y) const { - const CompressedRowBlockStructure* bs = matrix_.block_structure(); + RightMultiplyAndAccumulateE(x, y, nullptr, 1); +} +template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> +void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: + RightMultiplyAndAccumulateE(const double* x, + double* y, + ContextImpl* context, + int num_threads) const { // Iterate over the first num_row_blocks_e_ row blocks, and multiply // by the first cell in each row block. + auto bs = matrix_.block_structure(); const double* values = matrix_.values(); - for (int r = 0; r < num_row_blocks_e_; ++r) { - const Cell& cell = bs->rows[r].cells[0]; - const int row_block_pos = bs->rows[r].block.position; - const int row_block_size = bs->rows[r].block.size; - const int col_block_id = cell.block_id; - const int col_block_pos = bs->cols[col_block_id].position; - const int col_block_size = bs->cols[col_block_id].size; - // clang-format off - MatrixVectorMultiply<kRowBlockSize, kEBlockSize, 1>( - values + cell.position, row_block_size, col_block_size, - x + col_block_pos, - y + row_block_pos); - // clang-format on - } + ParallelFor(context, + 0, + num_row_blocks_e_, + num_threads, + [values, bs, x, y](int row_block_id) { + const Cell& cell = bs->rows[row_block_id].cells[0]; + const int row_block_pos = bs->rows[row_block_id].block.position; + const int row_block_size = bs->rows[row_block_id].block.size; + const int col_block_id = cell.block_id; + const int col_block_pos = bs->cols[col_block_id].position; + const int col_block_size = bs->cols[col_block_id].size; + // clang-format off + MatrixVectorMultiply<kRowBlockSize, kEBlockSize, 1>( + values + cell.position, row_block_size, col_block_size, + x + col_block_pos, + y + row_block_pos); + // clang-format on + }); } template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: RightMultiplyAndAccumulateF(const double* x, double* y) const { - const CompressedRowBlockStructure* bs = matrix_.block_structure(); + RightMultiplyAndAccumulateF(x, y, nullptr, 1); +} +template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> +void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: + RightMultiplyAndAccumulateF(const double* x, + double* y, + ContextImpl* context, + int num_threads) const { // Iterate over row blocks, and if the row block is in E, then // multiply by all the cells except the first one which is of type // E. If the row block is not in E (i.e its in the bottom // num_row_blocks - num_row_blocks_e row blocks), then all the cells // are of type F and multiply by them all. + const CompressedRowBlockStructure* bs = matrix_.block_structure(); + const int num_row_blocks = bs->rows.size(); + const int num_cols_e = num_cols_e_; const double* values = matrix_.values(); - for (int r = 0; r < num_row_blocks_e_; ++r) { - const int row_block_pos = bs->rows[r].block.position; - const int row_block_size = bs->rows[r].block.size; - const std::vector<Cell>& cells = bs->rows[r].cells; - for (int c = 1; c < cells.size(); ++c) { - const int col_block_id = cells[c].block_id; - const int col_block_pos = bs->cols[col_block_id].position; - const int col_block_size = bs->cols[col_block_id].size; - // clang-format off - MatrixVectorMultiply<kRowBlockSize, kFBlockSize, 1>( - values + cells[c].position, row_block_size, col_block_size, - x + col_block_pos - num_cols_e_, - y + row_block_pos); - // clang-format on - } - } - - for (int r = num_row_blocks_e_; r < bs->rows.size(); ++r) { - const int row_block_pos = bs->rows[r].block.position; - const int row_block_size = bs->rows[r].block.size; - const std::vector<Cell>& cells = bs->rows[r].cells; - for (const auto& cell : cells) { - const int col_block_id = cell.block_id; - const int col_block_pos = bs->cols[col_block_id].position; - const int col_block_size = bs->cols[col_block_id].size; - // clang-format off - MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>( - values + cell.position, row_block_size, col_block_size, - x + col_block_pos - num_cols_e_, - y + row_block_pos); - // clang-format on - } - } + ParallelFor(context, + 0, + num_row_blocks_e_, + num_threads, + [values, bs, num_cols_e, x, y](int row_block_id) { + const int row_block_pos = bs->rows[row_block_id].block.position; + const int row_block_size = bs->rows[row_block_id].block.size; + const auto& cells = bs->rows[row_block_id].cells; + for (int c = 1; c < cells.size(); ++c) { + const int col_block_id = cells[c].block_id; + const int col_block_pos = bs->cols[col_block_id].position; + const int col_block_size = bs->cols[col_block_id].size; + // clang-format off + MatrixVectorMultiply<kRowBlockSize, kFBlockSize, 1>( + values + cells[c].position, row_block_size, col_block_size, + x + col_block_pos - num_cols_e, + y + row_block_pos); + // clang-format on + } + }); + ParallelFor(context, + num_row_blocks_e_, + num_row_blocks, + num_threads, + [values, bs, num_cols_e, x, y](int row_block_id) { + const int row_block_pos = bs->rows[row_block_id].block.position; + const int row_block_size = bs->rows[row_block_id].block.size; + const auto& cells = bs->rows[row_block_id].cells; + for (const auto& cell : cells) { + const int col_block_id = cell.block_id; + const int col_block_pos = bs->cols[col_block_id].position; + const int col_block_size = bs->cols[col_block_id].size; + // clang-format off + MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>( + values + cell.position, row_block_size, col_block_size, + x + col_block_pos - num_cols_e, + y + row_block_pos); + // clang-format on + } + }); } template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
diff --git a/internal/ceres/partitioned_matrix_view_test.cc b/internal/ceres/partitioned_matrix_view_test.cc index e4484dd..b11ea8e 100644 --- a/internal/ceres/partitioned_matrix_view_test.cc +++ b/internal/ceres/partitioned_matrix_view_test.cc
@@ -174,5 +174,64 @@ EXPECT_NEAR(block_diagonal_ff->values()[2], 37.0, kEpsilon); } +const int kMaxNumThreads = 8; +class PartitionedMatrixViewParallelTest : public ::testing::TestWithParam<int> { + protected: + void SetUp() final { + std::unique_ptr<LinearLeastSquaresProblem> problem = + CreateLinearLeastSquaresProblemFromId(2); + CHECK(problem != nullptr); + A_ = std::move(problem->A); + + num_cols_ = A_->num_cols(); + num_rows_ = A_->num_rows(); + num_eliminate_blocks_ = problem->num_eliminate_blocks; + LinearSolver::Options options; + options.elimination_groups.push_back(num_eliminate_blocks_); + pmv_ = PartitionedMatrixViewBase::Create( + options, *down_cast<BlockSparseMatrix*>(A_.get())); + context_.EnsureMinimumThreads(kMaxNumThreads); + } + + double RandDouble() { return distribution_(prng_); } + + ContextImpl context_; + int num_rows_; + int num_cols_; + int num_eliminate_blocks_; + std::unique_ptr<SparseMatrix> A_; + std::unique_ptr<PartitionedMatrixViewBase> pmv_; + std::mt19937 prng_; + std::uniform_real_distribution<double> distribution_ = + std::uniform_real_distribution<double>(0.0, 1.0); +}; + +TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateEParallel) { + const int kNumThreads = GetParam(); + Vector x1(pmv_->num_cols_e()); + Vector x2(pmv_->num_cols()); + x2.setZero(); + + for (int i = 0; i < pmv_->num_cols_e(); ++i) { + x1(i) = x2(i) = RandDouble(); + } + + Vector y1 = Vector::Zero(pmv_->num_rows()); + pmv_->RightMultiplyAndAccumulateE( + x1.data(), y1.data(), &context_, kNumThreads); + + Vector y2 = Vector::Zero(pmv_->num_rows()); + A_->RightMultiplyAndAccumulate(x2.data(), y2.data()); + + for (int i = 0; i < pmv_->num_rows(); ++i) { + EXPECT_NEAR(y1(i), y2(i), kEpsilon); + } +} + +INSTANTIATE_TEST_SUITE_P(ParallelProducts, + PartitionedMatrixViewParallelTest, + ::testing::Values(1, 2, 4, 8), + ::testing::PrintToStringParamName()); + } // namespace internal } // namespace ceres
diff --git a/internal/ceres/power_series_expansion_preconditioner_test.cc b/internal/ceres/power_series_expansion_preconditioner_test.cc index a291405..233d4f7 100644 --- a/internal/ceres/power_series_expansion_preconditioner_test.cc +++ b/internal/ceres/power_series_expansion_preconditioner_test.cc
@@ -45,10 +45,9 @@ const auto A = down_cast<BlockSparseMatrix*>(problem_->A.get()); const auto D = problem_->D.get(); - LinearSolver::Options options; - options.elimination_groups.push_back(problem_->num_eliminate_blocks); - options.preconditioner_type = SCHUR_POWER_SERIES_EXPANSION; - isc_ = std::make_unique<ImplicitSchurComplement>(options); + options_.elimination_groups.push_back(problem_->num_eliminate_blocks); + options_.preconditioner_type = SCHUR_POWER_SERIES_EXPANSION; + isc_ = std::make_unique<ImplicitSchurComplement>(options_); isc_->Init(*A, D, problem_->b.get()); num_f_cols_ = isc_->rhs().rows(); const int num_rows = A->num_rows(), num_cols = A->num_cols(), @@ -76,6 +75,7 @@ std::unique_ptr<ImplicitSchurComplement> isc_; int num_f_cols_; Matrix sc_inverse_expected_; + LinearSolver::Options options_; }; TEST_F(PowerSeriesExpansionPreconditionerTest, @@ -168,4 +168,4 @@ } } -} // namespace ceres::internal \ No newline at end of file +} // namespace ceres::internal
diff --git a/internal/ceres/sparse_matrix.h b/internal/ceres/sparse_matrix.h index da6af18..e665c8b 100644 --- a/internal/ceres/sparse_matrix.h +++ b/internal/ceres/sparse_matrix.h
@@ -41,6 +41,7 @@ #include "ceres/types.h" namespace ceres::internal { +struct ContextImpl; // This class defines the interface for storing and manipulating // sparse matrices. The key property that differentiates different
diff --git a/internal/ceres/spmv_benchmark.cc b/internal/ceres/spmv_benchmark.cc index 0bd1e43..69df1ea 100644 --- a/internal/ceres/spmv_benchmark.cc +++ b/internal/ceres/spmv_benchmark.cc
@@ -66,26 +66,37 @@ static void BM_BlockSparseRightMultiplyAndAccumulateBA( benchmark::State& state) { + const int num_threads = state.range(0); std::mt19937 prng; auto jacobian = CreateFakeBundleAdjustmentJacobian( kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); + ContextImpl context; + context.EnsureMinimumThreads(num_threads); + Vector x(jacobian->num_cols()); Vector y(jacobian->num_rows()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { - jacobian->RightMultiplyAndAccumulate(x.data(), y.data()); + jacobian->RightMultiplyAndAccumulate( + x.data(), y.data(), &context, num_threads); sum += y.norm(); } CHECK_NE(sum, 0.0); } -BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA); +BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); static void BM_BlockSparseRightMultiplyAndAccumulateUnstructured( benchmark::State& state) { + const int num_threads = state.range(0); BlockSparseMatrix::RandomMatrixOptions options; options.num_row_blocks = kNumRowBlocks; options.num_col_blocks = kNumColBlocks; @@ -98,19 +109,28 @@ auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); + ContextImpl context; + context.EnsureMinimumThreads(num_threads); + Vector x(jacobian->num_cols()); Vector y(jacobian->num_rows()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { - jacobian->RightMultiplyAndAccumulate(x.data(), y.data()); + jacobian->RightMultiplyAndAccumulate( + x.data(), y.data(), &context, num_threads); sum += y.norm(); } CHECK_NE(sum, 0.0); } -BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured); +BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); static void BM_BlockSparseLeftMultiplyAndAccumulateBA(benchmark::State& state) { std::mt19937 prng; @@ -158,6 +178,7 @@ BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateUnstructured); static void BM_CRSRightMultiplyAndAccumulateBA(benchmark::State& state) { + const int num_threads = state.range(0); std::mt19937 prng; auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian( kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); @@ -167,22 +188,32 @@ bsm_jacobian->num_nonzeros()); bsm_jacobian->ToCompressedRowSparseMatrix(&jacobian); + ContextImpl context; + context.EnsureMinimumThreads(num_threads); + Vector x(jacobian.num_cols()); Vector y(jacobian.num_rows()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { - jacobian.RightMultiplyAndAccumulate(x.data(), y.data()); + jacobian.RightMultiplyAndAccumulate( + x.data(), y.data(), &context, num_threads); sum += y.norm(); } CHECK_NE(sum, 0.0); } -BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA); +BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); static void BM_CRSRightMultiplyAndAccumulateUnstructured( benchmark::State& state) { + const int num_threads = state.range(0); BlockSparseMatrix::RandomMatrixOptions options; options.num_row_blocks = kNumRowBlocks; options.num_col_blocks = kNumColBlocks; @@ -199,19 +230,28 @@ bsm_jacobian->num_nonzeros()); bsm_jacobian->ToCompressedRowSparseMatrix(&jacobian); + ContextImpl context; + context.EnsureMinimumThreads(num_threads); + Vector x(jacobian.num_cols()); Vector y(jacobian.num_rows()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { - jacobian.RightMultiplyAndAccumulate(x.data(), y.data()); + jacobian.RightMultiplyAndAccumulate( + x.data(), y.data(), &context, num_threads); sum += y.norm(); } CHECK_NE(sum, 0.0); } -BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured); +BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); static void BM_CRSLeftMultiplyAndAccumulateBA(benchmark::State& state) { std::mt19937 prng;