Parallel operations on vectors
Main focus of this change is to parallelize remaining operations (most of them
are operations on vectors) in code-path utilized with iterative Schur
complement.
Parallelization is handled using lazy evaluation of Eigen expressions.
On linux pc with intel 8176 processor parallelization of vector operations has
the following effect:
Running ./bin/parallel_vector_operations_benchmark
Run on (112 X 3200.32 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x56)
L1 Instruction 32 KiB (x56)
L2 Unified 1024 KiB (x56)
L3 Unified 39424 KiB (x2)
Load Average: 3.30, 8.41, 11.82
-----------------------------------
Benchmark Time
-----------------------------------
SetZero 10009532 ns
SetZeroParallel/1 10024139 ns
...
SetZeroParallel/16 877606 ns
Negate 4978856 ns
NegateParallel/1 5145413 ns
...
NegateParallel/16 721823 ns
Assign 10731408 ns
AssignParallel/1 10749944 ns
...
AssignParallel/16 1829381 ns
D2X 15214399 ns
D2XParallel/1 15623245 ns
...
D2XParallel/16 2687060 ns
DivideSqrt 8220050 ns
DivideSqrtParallel/1 9088467 ns
...
DivideSqrtParallel/16 905569 ns
Clamp 3502010 ns
ClampParallel/1 4507897 ns
...
ClampParallel/16 759576 ns
Norm 4426782 ns
NormParallel/1 4442805 ns
...
NormParallel/16 430290 ns
Dot 9023276 ns
DotParallel/1 9031304 ns
...
DotParallel/16 1157267 ns
Axpby 14608289 ns
AxpbyParallel/1 14570825 ns
...
AxpbyParallel/16 2672220 ns
-----------------------------------
Multi-threading of vector operations in ISC and program evaluation results into
the following improvement:
Running ./bin/evaluation_benchmark
--------------------------------------------------------------------------------------
Benchmark this 2fd81de
--------------------------------------------------------------------------------------
Residuals<problem-13682-4456117-pre.txt>/1 4136 ms 4292 ms
Residuals<problem-13682-4456117-pre.txt>/2 2919 ms 2670 ms
Residuals<problem-13682-4456117-pre.txt>/4 2065 ms 2198 ms
Residuals<problem-13682-4456117-pre.txt>/8 1458 ms 1609 ms
Residuals<problem-13682-4456117-pre.txt>/16 1152 ms 1227 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/1 19759 ms 20084 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/2 10921 ms 10977 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/4 6220 ms 6941 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/8 3490 ms 4398 ms
ResidualsAndJacobian<problem-13682-4456117-pre.txt>/16 2277 ms 3172 ms
Plus<problem-13682-4456117-pre.txt>/1 339 ms 322 ms
Plus<problem-13682-4456117-pre.txt>/2 220 ms
Plus<problem-13682-4456117-pre.txt>/4 128 ms
Plus<problem-13682-4456117-pre.txt>/8 78.0 ms
Plus<problem-13682-4456117-pre.txt>/16 49.8 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/1 2434 ms 2478 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/2 2706 ms 2688 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/4 1430 ms 1548 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/8 742 ms 883 ms
ISCRightMultiplyAndAccumulate<problem-13682-4456117-pre.txt>/16 438 ms 555 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/1 2438 ms 2481 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/2 2565 ms 2790 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/4 1434 ms 1551 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/8 765 ms 892 ms
ISCRightMultiplyAndAccumulateDiag<problem-13682-4456117-pre.txt>/16 435 ms 559 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/1 1278 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/2 1555 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/4 833 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/8 459 ms
JacobianSquaredColumnNorm<problem-13682-4456117-pre.txt>/16 250 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/1 1468 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/2 1871 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/4 957 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/8 528 ms
JacobianScaleColumns<problem-13682-4456117-pre.txt>/16 294 ms
End-to-end improvements with bundle_adjuster invoked with
./bin/bundle_adjuster --num_threads 28 --num_iterations 40 \
--linear_solver iterative_schur \
--preconditioner jacobi --input
---------------------------------------------
Problem this 2fd81de
---------------------------------------------
problem-13682-4456117-pre.txt 508.6 892.7
problem-1778-993923-pre.txt 763.8 1129.9
problem-1723-156502-pre.txt 6.3 14.4
problem-356-226730-pre.txt 76.3 116.2
problem-257-65132-pre.txt 38.6 52.0
Change-Id: Ie31cc5015f13fa479c16ffb5ce48c9b880990d49
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 1e6a699..23dbbd6 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -589,6 +589,9 @@
add_executable(spmv_benchmark spmv_benchmark.cc)
add_dependencies_to_benchmark(spmv_benchmark)
+ add_executable(parallel_vector_operations_benchmark parallel_vector_operations_benchmark.cc)
+ add_dependencies_to_benchmark(parallel_vector_operations_benchmark)
+
add_executable(block_jacobi_preconditioner_benchmark
block_jacobi_preconditioner_benchmark.cc)
add_dependencies_to_benchmark(block_jacobi_preconditioner_benchmark)
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index 459d709..c81ea39 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -94,6 +94,7 @@
values_ = std::make_unique<double[]>(num_nonzeros_);
max_num_nonzeros_ = num_nonzeros_;
CHECK(values_ != nullptr);
+ AddTransposeBlockStructure();
}
void BlockSparseMatrix::AddTransposeBlockStructure() {
@@ -106,6 +107,10 @@
std::fill(values_.get(), values_.get() + num_nonzeros_, 0.0);
}
+void BlockSparseMatrix::SetZero(ContextImpl* context, int num_threads) {
+ ParallelSetZero(context, num_threads, values_.get(), num_nonzeros_);
+}
+
void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
double* y) const {
RightMultiplyAndAccumulate(x, y, nullptr, 1);
@@ -148,6 +153,7 @@
});
}
+// TODO: This method might benefit from caching column-block partition
void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
@@ -238,6 +244,40 @@
}
}
+// TODO: This method might benefit from caching column-block partition
+void BlockSparseMatrix::SquaredColumnNorm(double* x,
+ ContextImpl* context,
+ int num_threads) const {
+ if (transpose_block_structure_ == nullptr || num_threads == 1) {
+ SquaredColumnNorm(x);
+ return;
+ }
+
+ CHECK(x != nullptr);
+ ParallelSetZero(context, num_threads, x, num_cols_);
+
+ auto transpose_bs = transpose_block_structure_.get();
+ const auto values = values_.get();
+ const int num_col_blocks = transpose_bs->rows.size();
+ ParallelFor(
+ context,
+ 0,
+ num_col_blocks,
+ num_threads,
+ [values, transpose_bs, x](int row_block_id) {
+ const auto& row = transpose_bs->rows[row_block_id];
+
+ for (auto& cell : row.cells) {
+ const auto& col = transpose_bs->cols[cell.block_id];
+ const MatrixRef m(values + cell.position, col.size, row.block.size);
+ VectorRef(x + row.block.position, row.block.size) +=
+ m.colwise().squaredNorm();
+ }
+ },
+ transpose_bs->rows.data(),
+ [](const CompressedRow& row) { return row.cumulative_nnz; });
+}
+
void BlockSparseMatrix::ScaleColumns(const double* scale) {
CHECK(scale != nullptr);
@@ -255,6 +295,38 @@
}
}
+// TODO: This method might benefit from caching column-block partition
+void BlockSparseMatrix::ScaleColumns(const double* scale,
+ ContextImpl* context,
+ int num_threads) {
+ if (transpose_block_structure_ == nullptr || num_threads == 1) {
+ ScaleColumns(scale);
+ return;
+ }
+
+ CHECK(scale != nullptr);
+ auto transpose_bs = transpose_block_structure_.get();
+ auto values = values_.get();
+ const int num_col_blocks = transpose_bs->rows.size();
+ ParallelFor(
+ context,
+ 0,
+ num_col_blocks,
+ num_threads,
+ [values, transpose_bs, scale](int row_block_id) {
+ const auto& row = transpose_bs->rows[row_block_id];
+
+ for (auto& cell : row.cells) {
+ const auto& col = transpose_bs->cols[cell.block_id];
+ MatrixRef m(values + cell.position, col.size, row.block.size);
+ m *= ConstVectorRef(scale + row.block.position, row.block.size)
+ .asDiagonal();
+ }
+ },
+ transpose_bs->rows.data(),
+ [](const CompressedRow& row) { return row.cumulative_nnz; });
+}
+
void BlockSparseMatrix::ToCompressedRowSparseMatrix(
CompressedRowSparseMatrix* crs_matrix) const {
{
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h
index 72b1d68..ed3146f 100644
--- a/internal/ceres/block_sparse_matrix.h
+++ b/internal/ceres/block_sparse_matrix.h
@@ -71,7 +71,8 @@
void operator=(const BlockSparseMatrix&) = delete;
// Implementation of SparseMatrix interface.
- void SetZero() final;
+ void SetZero() override final;
+ void SetZero(ContextImpl* context, int num_threads) override final;
void RightMultiplyAndAccumulate(const double* x, double* y) const final;
void RightMultiplyAndAccumulate(const double* x,
double* y,
@@ -83,7 +84,13 @@
ContextImpl* context,
int num_threads) const final;
void SquaredColumnNorm(double* x) const final;
+ void SquaredColumnNorm(double* x,
+ ContextImpl* context,
+ int num_threads) const final;
void ScaleColumns(const double* scale) final;
+ void ScaleColumns(const double* scale,
+ ContextImpl* context,
+ int num_threads) final;
void ToCompressedRowSparseMatrix(CompressedRowSparseMatrix* matrix) const;
void ToDenseMatrix(Matrix* dense_matrix) const final;
void ToTextFile(FILE* file) const final;
diff --git a/internal/ceres/block_sparse_matrix_test.cc b/internal/ceres/block_sparse_matrix_test.cc
index 6533a80..0529a2d 100644
--- a/internal/ceres/block_sparse_matrix_test.cc
+++ b/internal/ceres/block_sparse_matrix_test.cc
@@ -136,10 +136,23 @@
CHECK_EQ(A_->num_cols(), B_->num_cols());
CHECK_EQ(A_->num_nonzeros(), B_->num_nonzeros());
context_.EnsureMinimumThreads(kNumThreads);
+
+ BlockSparseMatrix::RandomMatrixOptions options;
+ options.num_row_blocks = 1000;
+ options.min_row_block_size = 1;
+ options.max_row_block_size = 8;
+ options.num_col_blocks = 100;
+ options.min_col_block_size = 1;
+ options.max_col_block_size = 8;
+ options.block_density = 0.05;
+
+ std::mt19937 rng;
+ C_ = BlockSparseMatrix::CreateRandomMatrix(options, rng);
}
std::unique_ptr<BlockSparseMatrix> A_;
std::unique_ptr<TripletSparseMatrix> B_;
+ std::unique_ptr<BlockSparseMatrix> C_;
ContextImpl context_;
};
@@ -187,14 +200,13 @@
}
TEST_F(BlockSparseMatrixTest, LeftMultiplyAndAccumulateParallelTest) {
- Vector y_0 = Vector::Random(A_->num_rows());
+ Vector y_0 = Vector::Random(A_->num_cols());
Vector y_s = y_0;
Vector y_p = y_0;
- Vector x = Vector::Random(A_->num_cols());
+ Vector x = Vector::Random(A_->num_rows());
A_->LeftMultiplyAndAccumulate(x.data(), y_s.data());
- A_->AddTransposeBlockStructure();
A_->LeftMultiplyAndAccumulate(x.data(), y_p.data(), &context_, kNumThreads);
// Parallel implementation for left products uses a different order of
@@ -210,6 +222,47 @@
EXPECT_LT((y_a - y_b).norm(), 1e-12);
}
+TEST_F(BlockSparseMatrixTest, SquaredColumnNormParallelTest) {
+ Vector y_a = Vector::Zero(C_->num_cols());
+ Vector y_b = Vector::Zero(C_->num_cols());
+ C_->SquaredColumnNorm(y_a.data());
+
+ C_->SquaredColumnNorm(y_b.data(), &context_, kNumThreads);
+ EXPECT_LT((y_a - y_b).norm(), 1e-12);
+}
+
+TEST_F(BlockSparseMatrixTest, ScaleColumnsTest) {
+ const Vector scale = Vector::Random(C_->num_cols()).cwiseAbs();
+
+ const Vector x = Vector::Random(C_->num_rows());
+ Vector y_expected = Vector::Zero(C_->num_cols());
+ C_->LeftMultiplyAndAccumulate(x.data(), y_expected.data());
+ y_expected.array() *= scale.array();
+
+ C_->ScaleColumns(scale.data());
+ Vector y_observed = Vector::Zero(C_->num_cols());
+ C_->LeftMultiplyAndAccumulate(x.data(), y_observed.data());
+
+ EXPECT_GT(y_expected.norm(), 1.);
+ EXPECT_LT((y_observed - y_expected).norm(), 1e-12 * y_expected.norm());
+}
+
+TEST_F(BlockSparseMatrixTest, ScaleColumnsParallelTest) {
+ const Vector scale = Vector::Random(C_->num_cols()).cwiseAbs();
+
+ const Vector x = Vector::Random(C_->num_rows());
+ Vector y_expected = Vector::Zero(C_->num_cols());
+ C_->LeftMultiplyAndAccumulate(x.data(), y_expected.data());
+ y_expected.array() *= scale.array();
+
+ C_->ScaleColumns(scale.data(), &context_, kNumThreads);
+ Vector y_observed = Vector::Zero(C_->num_cols());
+ C_->LeftMultiplyAndAccumulate(x.data(), y_observed.data());
+
+ EXPECT_GT(y_expected.norm(), 1.);
+ EXPECT_LT((y_observed - y_expected).norm(), 1e-12 * y_expected.norm());
+}
+
TEST_F(BlockSparseMatrixTest, ToDenseMatrixTest) {
Matrix m_a;
Matrix m_b;
@@ -251,7 +304,6 @@
std::unique_ptr<BlockSparseMatrix> m(
down_cast<BlockSparseMatrix*>(problem->A.release()));
- A_->AddTransposeBlockStructure();
auto block_structure = A_->block_structure();
// Several AppendRows and DeleteRowBlocks operations are applied to matrix,
@@ -486,7 +538,6 @@
*ap_bs = *a->block_structure();
BlockSparseMatrix ap(ap_bs.release());
std::copy_n(a->values(), a->num_nonzeros(), ap.mutable_values());
- ap.AddTransposeBlockStructure();
Vector x = Vector::Random(a->num_cols());
Vector y = Vector::Random(a->num_rows());
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index 524356a..a1a3c6e 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -104,7 +104,11 @@
// y = y + DtDx
if (D_ != nullptr) {
int n = A_.num_cols();
- y.array() += ConstVectorRef(D_, n).array().square() * x.array();
+ ParallelAssign(
+ context_,
+ num_threads_,
+ y,
+ y.array() + ConstVectorRef(D_, n).array().square() * x.array());
}
}
@@ -156,6 +160,8 @@
preconditioner_options.context = options_.context;
if (options_.preconditioner_type == JACOBI) {
+ // TODO: BlockSparseJacobiPreconditioner::RightMultiply will benefit from
+ // multithreading
preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>(
preconditioner_options, *A);
} else if (options_.preconditioner_type == SUBSET) {
@@ -165,9 +171,6 @@
preconditioner_ = std::make_unique<IdentityPreconditioner>(A->num_cols());
}
}
- if (options_.num_threads > 1) {
- A->AddTransposeBlockStructure();
- }
preconditioner_->Update(*A, per_solve_options.D);
ConjugateGradientsSolverOptions cg_options;
@@ -176,6 +179,8 @@
cg_options.residual_reset_period = options_.residual_reset_period;
cg_options.q_tolerance = per_solve_options.q_tolerance;
cg_options.r_tolerance = per_solve_options.r_tolerance;
+ cg_options.context = options_.context;
+ cg_options.num_threads = options_.num_threads;
// lhs = AtA + DtD
CgnrLinearOperator lhs(
diff --git a/internal/ceres/conjugate_gradients_solver.h b/internal/ceres/conjugate_gradients_solver.h
index 0c3581b..7da63c0 100644
--- a/internal/ceres/conjugate_gradients_solver.h
+++ b/internal/ceres/conjugate_gradients_solver.h
@@ -82,6 +82,8 @@
int residual_reset_period = 10;
double r_tolerance = 0.0;
double q_tolerance = 0.0;
+ ContextImpl* context = nullptr;
+ int num_threads = 1;
};
// This function implements the now classical Conjugate Gradients algorithm of
@@ -125,9 +127,9 @@
summary.message = "Maximum number of iterations reached.";
summary.num_iterations = 0;
- const double norm_rhs = Norm(rhs);
+ const double norm_rhs = Norm(rhs, options.context, options.num_threads);
if (norm_rhs == 0.0) {
- SetZero(solution);
+ SetZero(solution, options.context, options.num_threads);
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message = "Convergence. |b| = 0.";
return summary;
@@ -135,13 +137,13 @@
const double tol_r = options.r_tolerance * norm_rhs;
- SetZero(tmp);
+ SetZero(tmp, options.context, options.num_threads);
lhs.RightMultiplyAndAccumulate(solution, tmp);
// r = rhs - tmp
- Axpby(1.0, rhs, -1.0, tmp, r);
+ Axpby(1.0, rhs, -1.0, tmp, r, options.context, options.num_threads);
- double norm_r = Norm(r);
+ double norm_r = Norm(r, options.context, options.num_threads);
if (options.min_num_iterations == 0 && norm_r <= tol_r) {
summary.termination_type = LinearSolverTerminationType::SUCCESS;
summary.message =
@@ -153,16 +155,16 @@
// Initial value of the quadratic model Q = x'Ax - 2 * b'x.
// double Q0 = -1.0 * solution.dot(rhs + r);
- Axpby(1.0, rhs, 1.0, r, tmp);
- double Q0 = -Dot(solution, tmp);
+ Axpby(1.0, rhs, 1.0, r, tmp, options.context, options.num_threads);
+ double Q0 = -Dot(solution, tmp, options.context, options.num_threads);
for (summary.num_iterations = 1;; ++summary.num_iterations) {
- SetZero(z);
+ SetZero(z, options.context, options.num_threads);
preconditioner.RightMultiplyAndAccumulate(r, z);
const double last_rho = rho;
// rho = r.dot(z);
- rho = Dot(r, z);
+ rho = Dot(r, z, options.context, options.num_threads);
if (IsZeroOrInfinity(rho)) {
summary.termination_type = LinearSolverTerminationType::FAILURE;
summary.message = StringPrintf("Numerical failure. rho = r'z = %e.", rho);
@@ -170,7 +172,7 @@
}
if (summary.num_iterations == 1) {
- Copy(z, p);
+ Copy(z, p, options.context, options.num_threads);
} else {
const double beta = rho / last_rho;
if (IsZeroOrInfinity(beta)) {
@@ -184,21 +186,21 @@
break;
}
// p = z + beta * p;
- Axpby(1.0, z, beta, p, p);
+ Axpby(1.0, z, beta, p, p, options.context, options.num_threads);
}
DenseVectorType& q = z;
- SetZero(q);
+ SetZero(q, options.context, options.num_threads);
lhs.RightMultiplyAndAccumulate(p, q);
- const double pq = Dot(p, q);
+ const double pq = Dot(p, q, options.context, options.num_threads);
if ((pq <= 0) || std::isinf(pq)) {
summary.termination_type = LinearSolverTerminationType::NO_CONVERGENCE;
summary.message = StringPrintf(
"Matrix is indefinite, no more progress can be made. "
"p'q = %e. |p| = %e, |q| = %e",
pq,
- Norm(p),
- Norm(q));
+ Norm(p, options.context, options.num_threads),
+ Norm(q, options.context, options.num_threads));
break;
}
@@ -214,7 +216,13 @@
}
// solution = solution + alpha * p;
- Axpby(1.0, solution, alpha, p, solution);
+ Axpby(1.0,
+ solution,
+ alpha,
+ p,
+ solution,
+ options.context,
+ options.num_threads);
// Ideally we would just use the update r = r - alpha*q to keep
// track of the residual vector. However this estimate tends to
@@ -224,20 +232,20 @@
// requires an additional matrix vector multiply which would
// double the complexity of the CG algorithm.
if (summary.num_iterations % options.residual_reset_period == 0) {
- SetZero(tmp);
+ SetZero(tmp, options.context, options.num_threads);
lhs.RightMultiplyAndAccumulate(solution, tmp);
- Axpby(1.0, rhs, -1.0, tmp, r);
+ Axpby(1.0, rhs, -1.0, tmp, r, options.context, options.num_threads);
// r = rhs - tmp;
} else {
- Axpby(1.0, r, -alpha, q, r);
+ Axpby(1.0, r, -alpha, q, r, options.context, options.num_threads);
// r = r - alpha * q;
}
// Quadratic model based termination.
// Q1 = x'Ax - 2 * b' x.
// const double Q1 = -1.0 * solution.dot(rhs + r);
- Axpby(1.0, rhs, 1.0, r, tmp);
- const double Q1 = -Dot(solution, tmp);
+ Axpby(1.0, rhs, 1.0, r, tmp, options.context, options.num_threads);
+ const double Q1 = -Dot(solution, tmp, options.context, options.num_threads);
// For PSD matrices A, let
//
@@ -270,13 +278,13 @@
summary.num_iterations,
zeta,
options.q_tolerance,
- Norm(r));
+ Norm(r, options.context, options.num_threads));
break;
}
Q0 = Q1;
// Residual based termination.
- norm_r = Norm(r);
+ norm_r = Norm(r, options.context, options.num_threads);
if (norm_r <= tol_r &&
summary.num_iterations >= options.min_num_iterations) {
summary.termination_type = LinearSolverTerminationType::SUCCESS;
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h
index 985eac0..34f3947 100644
--- a/internal/ceres/cuda_vector.h
+++ b/internal/ceres/cuda_vector.h
@@ -121,13 +121,31 @@
// Blas1 operations on Cuda vectors. These functions are needed as an
// abstraction layer so that we can use different versions of a vector style
// object in the conjugate gradients linear solver.
-inline double Norm(const CudaVector& x) { return x.Norm(); }
-inline void SetZero(CudaVector& x) { x.SetZero(); }
+// Context and num_threads arguments are not used by CUDA implementation,
+// context embedded into CudaVector is used instead.
+inline double Norm(const CudaVector& x,
+ ContextImpl* context = nullptr,
+ int num_threads = 1) {
+ (void)context;
+ (void)num_threads;
+ return x.Norm();
+}
+inline void SetZero(CudaVector& x,
+ ContextImpl* context = nullptr,
+ int num_threads = 1) {
+ (void)context;
+ (void)num_threads;
+ x.SetZero();
+}
inline void Axpby(double a,
const CudaVector& x,
double b,
const CudaVector& y,
- CudaVector& z) {
+ CudaVector& z,
+ ContextImpl* context = nullptr,
+ int num_threads = 1) {
+ (void)context;
+ (void)num_threads;
if (&x == &y && &y == &z) {
// z = (a + b) * z;
z.Scale(a + b);
@@ -146,8 +164,22 @@
z.Axpby(a, x, b);
}
}
-inline double Dot(const CudaVector& x, const CudaVector& y) { return x.Dot(y); }
-inline void Copy(const CudaVector& from, CudaVector& to) { to = from; }
+inline double Dot(const CudaVector& x,
+ const CudaVector& y,
+ ContextImpl* context = nullptr,
+ int num_threads = 1) {
+ (void)context;
+ (void)num_threads;
+ return x.Dot(y);
+}
+inline void Copy(const CudaVector& from,
+ CudaVector& to,
+ ContextImpl* context = nullptr,
+ int num_threads = 1) {
+ (void)context;
+ (void)num_threads;
+ to = from;
+}
} // namespace ceres::internal
diff --git a/internal/ceres/eigen_vector_ops.h b/internal/ceres/eigen_vector_ops.h
index 5bcf49d..91f2946 100644
--- a/internal/ceres/eigen_vector_ops.h
+++ b/internal/ceres/eigen_vector_ops.h
@@ -31,21 +31,65 @@
#ifndef CERES_INTERNAL_EIGEN_VECTOR_OPS_H_
#define CERES_INTERNAL_EIGEN_VECTOR_OPS_H_
+#include <numeric>
+
#include "ceres/internal/eigen.h"
+#include "ceres/parallel_for.h"
namespace ceres::internal {
// Blas1 operations on Eigen vectors. These functions are needed as an
// abstraction layer so that we can use different versions of a vector style
// object in the conjugate gradients linear solver.
-inline double Norm(const Vector& x) { return x.norm(); }
-inline void SetZero(Vector& x) { x.setZero(); }
-inline void Axpby(
- double a, const Vector& x, double b, const Vector& y, Vector& z) {
- z = a * x + b * y;
+inline double Norm(const Vector& x, ContextImpl* context, int num_threads) {
+ std::vector<double> norms(num_threads);
+ ParallelFor(context,
+ 0,
+ x.rows(),
+ num_threads,
+ [&x, &norms](int thread_id, std::tuple<int, int> range) {
+ auto [start, end] = range;
+ norms[thread_id] += x.segment(start, end - start).squaredNorm();
+ });
+ return std::sqrt(std::accumulate(norms.begin(), norms.end(), 0.));
}
-inline double Dot(const Vector& x, const Vector& y) { return x.dot(y); }
-inline void Copy(const Vector& from, Vector& to) { to = from; }
+inline void SetZero(Vector& x, ContextImpl* context, int num_threads) {
+ ParallelSetZero(context, num_threads, x);
+}
+inline void Axpby(double a,
+ const Vector& x,
+ double b,
+ const Vector& y,
+ Vector& z,
+ ContextImpl* context,
+ int num_threads) {
+ ParallelAssign(context, num_threads, z, a * x + b * y);
+}
+template <typename VectorLikeX, typename VectorLikeY>
+inline double Dot(const VectorLikeX& x,
+ const VectorLikeY& y,
+ ContextImpl* context,
+ int num_threads) {
+ std::vector<double> dots(num_threads);
+ ParallelFor(context,
+ 0,
+ x.rows(),
+ num_threads,
+ [&x, &y, &dots](int thread_id, std::tuple<int, int> range) {
+ auto [start, end] = range;
+ const int block_size = end - start;
+ const auto& x_block = x.segment(start, block_size);
+ const auto& y_block = y.segment(start, block_size);
+ dots[thread_id] += x_block.dot(y_block);
+ });
+ return std::accumulate(dots.begin(), dots.end(), 0.);
+}
+inline void Copy(const Vector& from,
+ Vector& to,
+ ContextImpl* context,
+ int num_threads) {
+ ParallelAssign(context, num_threads, to, from);
+}
} // namespace ceres::internal
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 6f0591e..be784c2 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -39,6 +39,7 @@
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/evaluator.h"
+#include "ceres/implicit_schur_complement.h"
#include "ceres/partitioned_matrix_view.h"
#include "ceres/preprocessor.h"
#include "ceres/problem.h"
@@ -76,6 +77,21 @@
parameters.resize(program->NumParameters());
program->ParameterBlocksToStateVector(parameters.data());
+
+ const int num_residuals = program->NumResiduals();
+ b.resize(num_residuals);
+
+ std::mt19937 rng;
+ std::normal_distribution<double> rnorm;
+ for (int i = 0; i < num_residuals; ++i) {
+ b[i] = rnorm(rng);
+ }
+
+ const int num_parameters = program->NumParameters();
+ D.resize(num_parameters);
+ for (int i = 0; i < num_parameters; ++i) {
+ D[i] = rnorm(rng);
+ }
}
std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
@@ -128,15 +144,6 @@
return block_sparse_jacobian.get();
}
- const BlockSparseMatrix* BlockSparseJacobianWithTranspose(
- ContextImpl* context) {
- if (!block_sparse_jacobian_with_transpose) {
- block_sparse_jacobian_with_transpose = CreateBlockSparseJacobian(context);
- block_sparse_jacobian_with_transpose->AddTransposeBlockStructure();
- }
- return block_sparse_jacobian_with_transpose.get();
- }
-
const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
ContextImpl* context) {
if (!crs_jacobian) {
@@ -147,14 +154,13 @@
std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian(
const LinearSolver::Options& options) {
- auto block_sparse = BlockSparseJacobianWithTranspose(options.context);
+ auto block_sparse = BlockSparseJacobian(options.context);
return std::make_unique<PartitionedView>(options, *block_sparse);
}
BlockSparseMatrix* BlockDiagonalEtE(const LinearSolver::Options& options) {
if (!block_diagonal_ete) {
- auto partitioned_view =
- PartitionedMatrixViewJacobian(options);
+ auto partitioned_view = PartitionedMatrixViewJacobian(options);
block_diagonal_ete = partitioned_view->CreateBlockDiagonalEtE();
}
return block_diagonal_ete.get();
@@ -162,21 +168,41 @@
BlockSparseMatrix* BlockDiagonalFtF(const LinearSolver::Options& options) {
if (!block_diagonal_ftf) {
- auto partitioned_view =
- PartitionedMatrixViewJacobian(options);
+ auto partitioned_view = PartitionedMatrixViewJacobian(options);
block_diagonal_ftf = partitioned_view->CreateBlockDiagonalFtF();
}
return block_diagonal_ftf.get();
}
+ const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal(
+ const LinearSolver::Options& options) {
+ auto block_sparse = BlockSparseJacobian(options.context);
+ implicit_schur_complement =
+ std::make_unique<ImplicitSchurComplement>(options);
+ implicit_schur_complement->Init(*block_sparse, nullptr, b.data());
+ return implicit_schur_complement.get();
+ }
+
+ const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal(
+ const LinearSolver::Options& options) {
+ auto block_sparse = BlockSparseJacobian(options.context);
+ implicit_schur_complement_diag =
+ std::make_unique<ImplicitSchurComplement>(options);
+ implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data());
+ return implicit_schur_complement_diag.get();
+ }
+
Vector parameters;
+ Vector D;
+ Vector b;
std::unique_ptr<BundleAdjustmentProblem> bal_problem;
std::unique_ptr<PreprocessedProblem> preprocessed_problem;
std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
- std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_with_transpose;
std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;
std::unique_ptr<BlockSparseMatrix> block_diagonal_ftf;
+ std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement;
+ std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement_diag;
};
static void Residuals(benchmark::State& state,
@@ -244,6 +270,32 @@
}
}
+static void Plus(benchmark::State& state, BALData* data, ContextImpl* context) {
+ const int num_threads = state.range(0);
+
+ Evaluator::Options options;
+ options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
+ options.num_threads = num_threads;
+ options.context = context;
+ options.num_eliminate_blocks = 0;
+
+ std::string error;
+ CHECK(data->preprocessed_problem != nullptr);
+ auto program = data->preprocessed_problem->reduced_program.get();
+ CHECK(program != nullptr);
+ auto evaluator = Evaluator::Create(options, program, &error);
+ CHECK(evaluator != nullptr);
+
+ Vector state_plus_delta = Vector::Zero(program->NumParameters());
+ Vector delta = Vector::Random(program->NumEffectiveParameters());
+
+ for (auto _ : state) {
+ CHECK(evaluator->Plus(
+ data->parameters.data(), delta.data(), state_plus_delta.data()));
+ }
+ CHECK_GT(state_plus_delta.squaredNorm(), 0.);
+}
+
static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
BALData* data,
ContextImpl* context) {
@@ -346,6 +398,71 @@
}
}
+static void ISCRightMultiplyNoDiag(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ LinearSolver::Options options;
+ options.num_threads = state.range(0);
+ options.elimination_groups.push_back(data->bal_problem->num_points());
+ options.context = context;
+ auto jacobian = data->ImplicitSchurComplementWithoutDiagonal(options);
+
+ Vector y = Vector::Zero(jacobian->num_rows());
+ Vector x = Vector::Random(jacobian->num_cols());
+ for (auto _ : state) {
+ jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void ISCRightMultiplyDiag(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ LinearSolver::Options options;
+ options.num_threads = state.range(0);
+ options.elimination_groups.push_back(data->bal_problem->num_points());
+ options.context = context;
+
+ auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
+
+ Vector y = Vector::Zero(jacobian->num_rows());
+ Vector x = Vector::Random(jacobian->num_cols());
+ for (auto _ : state) {
+ jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void JacobianSquaredColumnNorm(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ const int num_threads = state.range(0);
+
+ auto jacobian = data->BlockSparseJacobian(context);
+
+ Vector x = Vector::Zero(jacobian->num_cols());
+
+ for (auto _ : state) {
+ jacobian->SquaredColumnNorm(x.data(), context, num_threads);
+ }
+ CHECK_GT(x.squaredNorm(), 0.);
+}
+
+static void JacobianScaleColumns(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ const int num_threads = state.range(0);
+
+ auto jacobian_const = data->BlockSparseJacobian(context);
+ auto jacobian = const_cast<BlockSparseMatrix*>(jacobian_const);
+
+ Vector x = Vector::Ones(jacobian->num_cols());
+
+ for (auto _ : state) {
+ jacobian->ScaleColumns(x.data(), context, num_threads);
+ }
+}
+
static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
BALData* data,
ContextImpl* context) {
@@ -368,7 +485,7 @@
ContextImpl* context) {
const int num_threads = state.range(0);
- auto jacobian = data->BlockSparseJacobianWithTranspose(context);
+ auto jacobian = data->BlockSparseJacobian(context);
Vector y = Vector::Zero(jacobian->num_cols());
Vector x = Vector::Random(jacobian->num_rows());
@@ -483,6 +600,15 @@
->Arg(8)
->Arg(16);
+ const std::string name_plus = "Plus<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_plus.c_str(), ceres::internal::Plus, data, &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
+
const std::string name_right_product =
"JacobianRightMultiplyAndAccumulate<" + path + ">";
::benchmark::RegisterBenchmark(
@@ -534,6 +660,18 @@
->Arg(8)
->Arg(16);
+ const std::string name_isc_no_diag =
+ "ISCRightMultiplyAndAccumulate<" + path + ">";
+ ::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
+ ceres::internal::ISCRightMultiplyNoDiag,
+ data,
+ &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
+
const std::string name_update_block_diagonal_ete =
"PMVUpdateBlockDiagonalEtE<" + path + ">";
::benchmark::RegisterBenchmark(name_update_block_diagonal_ete.c_str(),
@@ -545,6 +683,17 @@
->Arg(4)
->Arg(8)
->Arg(16);
+ const std::string name_isc_diag =
+ "ISCRightMultiplyAndAccumulateDiag<" + path + ">";
+ ::benchmark::RegisterBenchmark(name_isc_diag.c_str(),
+ ceres::internal::ISCRightMultiplyDiag,
+ data,
+ &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
#ifndef CERES_NO_CUDA
const std::string name_right_product_cuda =
@@ -606,6 +755,29 @@
&context)
->Arg(1);
#endif
+
+ const std::string name_squared_column_norm =
+ "JacobianSquaredColumnNorm<" + path + ">";
+ ::benchmark::RegisterBenchmark(name_squared_column_norm.c_str(),
+ ceres::internal::JacobianSquaredColumnNorm,
+ data,
+ &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
+
+ const std::string name_scale_columns = "JacobianScaleColumns<" + path + ">";
+ ::benchmark::RegisterBenchmark(name_scale_columns.c_str(),
+ ceres::internal::JacobianScaleColumns,
+ data,
+ &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
}
::benchmark::RunSpecifiedBenchmarks();
diff --git a/internal/ceres/implicit_schur_complement.cc b/internal/ceres/implicit_schur_complement.cc
index 4e36048..dbba3a7 100644
--- a/internal/ceres/implicit_schur_complement.cc
+++ b/internal/ceres/implicit_schur_complement.cc
@@ -35,6 +35,7 @@
#include "ceres/block_structure.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_solver.h"
+#include "ceres/parallel_for.h"
#include "ceres/types.h"
#include "glog/logging.h"
@@ -104,21 +105,22 @@
void ImplicitSchurComplement::RightMultiplyAndAccumulate(const double* x,
double* y) const {
// y1 = F x
- tmp_rows_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
// y2 = E' y1
- tmp_e_cols_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
// y3 = -(E'E)^-1 y2
- tmp_e_cols_2_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_2_);
block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
tmp_e_cols_2_.data(),
options_.context,
options_.num_threads);
- tmp_e_cols_2_ *= -1.0;
+ ParallelAssign(
+ options_.context, options_.num_threads, tmp_e_cols_2_, -tmp_e_cols_2_);
// y1 = y1 + E y3
A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
@@ -126,11 +128,14 @@
// y5 = D * x
if (D_ != nullptr) {
ConstVectorRef Dref(D_ + A_->num_cols_e(), num_cols());
- VectorRef(y, num_cols()) =
- (Dref.array().square() * ConstVectorRef(x, num_cols()).array())
- .matrix();
+ VectorRef y_cols(y, num_cols());
+ ParallelAssign(
+ options_.context,
+ options_.num_threads,
+ y_cols,
+ (Dref.array().square() * ConstVectorRef(x, num_cols()).array()));
} else {
- VectorRef(y, num_cols()).setZero();
+ ParallelSetZero(options_.context, options_.num_threads, y, num_cols());
}
// y = y5 + F' y1
@@ -141,25 +146,25 @@
const double* x, double* y) const {
CHECK(compute_ftf_inverse_);
// y1 = F x
- tmp_rows_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
// y2 = E' y1
- tmp_e_cols_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
// y3 = (E'E)^-1 y2
- tmp_e_cols_2_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_2_);
block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
tmp_e_cols_2_.data(),
options_.context,
options_.num_threads);
// y1 = E y3
- tmp_rows_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
// y4 = F' y1
- tmp_f_cols_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_f_cols_);
A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), tmp_f_cols_.data());
// y += (F'F)^-1 y4
@@ -174,22 +179,27 @@
const double* D, BlockSparseMatrix* block_diagonal) {
const CompressedRowBlockStructure* block_diagonal_structure =
block_diagonal->block_structure();
- for (const auto& row : block_diagonal_structure->rows) {
- const int row_block_pos = row.block.position;
- const int row_block_size = row.block.size;
- const Cell& cell = row.cells[0];
- MatrixRef m(block_diagonal->mutable_values() + cell.position,
- row_block_size,
- row_block_size);
+ ParallelFor(options_.context,
+ 0,
+ block_diagonal_structure->rows.size(),
+ options_.num_threads,
+ [block_diagonal_structure, D, block_diagonal](int row_block_id) {
+ auto& row = block_diagonal_structure->rows[row_block_id];
+ const int row_block_pos = row.block.position;
+ const int row_block_size = row.block.size;
+ const Cell& cell = row.cells[0];
+ MatrixRef m(block_diagonal->mutable_values() + cell.position,
+ row_block_size,
+ row_block_size);
- if (D != nullptr) {
- ConstVectorRef d(D + row_block_pos, row_block_size);
- m += d.array().square().matrix().asDiagonal();
- }
+ if (D != nullptr) {
+ ConstVectorRef d(D + row_block_pos, row_block_size);
+ m += d.array().square().matrix().asDiagonal();
+ }
- m = m.selfadjointView<Eigen::Upper>().llt().solve(
- Matrix::Identity(row_block_size, row_block_size));
- }
+ m = m.selfadjointView<Eigen::Upper>().llt().solve(
+ Matrix::Identity(row_block_size, row_block_size));
+ });
}
// Similar to RightMultiplyAndAccumulate, use the block structure of the matrix
@@ -201,18 +211,21 @@
const int num_rows = A_->num_rows();
// y1 = F x
- tmp_rows_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
A_->RightMultiplyAndAccumulateF(x, tmp_rows_.data());
// y2 = b - y1
- tmp_rows_ = ConstVectorRef(b_, num_rows) - tmp_rows_;
+ ParallelAssign(options_.context,
+ options_.num_threads,
+ tmp_rows_,
+ ConstVectorRef(b_, num_rows) - tmp_rows_);
// y3 = E' y2
- tmp_e_cols_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
// y = (E'E)^-1 y3
- VectorRef(y, num_cols).setZero();
+ ParallelSetZero(options_.context, options_.num_threads, y, num_cols);
block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(
tmp_e_cols_.data(), y, options_.context, options_.num_threads);
@@ -221,7 +234,11 @@
// computed via back substitution. The second block of variables
// corresponds to the Schur complement system, so we just copy those
// values from the solution to the Schur complement.
- VectorRef(y + num_cols_e, num_cols_f) = ConstVectorRef(x, num_cols_f);
+ VectorRef y_cols_f(y + num_cols_e, num_cols_f);
+ ParallelAssign(options_.context,
+ options_.num_threads,
+ y_cols_f,
+ ConstVectorRef(x, num_cols_f));
}
// Compute the RHS of the Schur complement system.
@@ -232,23 +249,28 @@
// this using a series of matrix vector products.
void ImplicitSchurComplement::UpdateRhs() {
// y1 = E'b
- tmp_e_cols_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_);
A_->LeftMultiplyAndAccumulateE(b_, tmp_e_cols_.data());
// y2 = (E'E)^-1 y1
- Vector y2 = Vector::Zero(A_->num_cols_e());
- block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(
- tmp_e_cols_.data(), y2.data(), options_.context, options_.num_threads);
+ ParallelSetZero(options_.context, options_.num_threads, tmp_e_cols_2_);
+ block_diagonal_EtE_inverse_->RightMultiplyAndAccumulate(tmp_e_cols_.data(),
+ tmp_e_cols_2_.data(),
+ options_.context,
+ options_.num_threads);
// y3 = E y2
- tmp_rows_.setZero();
- A_->RightMultiplyAndAccumulateE(y2.data(), tmp_rows_.data());
+ ParallelSetZero(options_.context, options_.num_threads, tmp_rows_);
+ A_->RightMultiplyAndAccumulateE(tmp_e_cols_2_.data(), tmp_rows_.data());
// y3 = b - y3
- tmp_rows_ = ConstVectorRef(b_, A_->num_rows()) - tmp_rows_;
+ ParallelAssign(options_.context,
+ options_.num_threads,
+ tmp_rows_,
+ ConstVectorRef(b_, A_->num_rows()) - tmp_rows_);
// rhs = F' y3
- rhs_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, rhs_);
A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), rhs_.data());
}
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc
index 589a1d2..16b20a7 100644
--- a/internal/ceres/iterative_schur_complement_solver.cc
+++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -130,7 +130,7 @@
Vector scratch[4];
for (int i = 0; i < 4; ++i) {
- scratch[i] = Vector::Zero(schur_complement_->num_cols());
+ scratch[i].resize(schur_complement_->num_cols());
}
Vector* scratch_ptr[4] = {&scratch[0], &scratch[1], &scratch[2], &scratch[3]};
@@ -180,15 +180,23 @@
break;
case JACOBI:
preconditioner_ = std::make_unique<SparseMatrixPreconditionerWrapper>(
- schur_complement_->block_diagonal_FtF_inverse());
+ schur_complement_->block_diagonal_FtF_inverse(),
+ preconditioner_options);
break;
case SCHUR_POWER_SERIES_EXPANSION:
// Ignoring the value of spse_tolerance to ensure preconditioner stays
// fixed during the iterations of cg.
+ // TODO: In PowerSeriesExpansionPreconditioner::RightMultiplyAndAccumulate
+ // only operations performed via ImplicitSchurComplement are threaded.
+ // PowerSeriesExpansionPreconditioner will benefit from multithreading
+ // applied to remaning operations (block-sparse right product and several
+ // vector operations)
preconditioner_ = std::make_unique<PowerSeriesExpansionPreconditioner>(
schur_complement_.get(), options_.max_num_spse_iterations, 0);
break;
case SCHUR_JACOBI:
+ // TODO: SchurJacobiPreconditioner::RightMultiply will benefit from
+ // multithreading
preconditioner_ = std::make_unique<SchurJacobiPreconditioner>(
*A->block_structure(), preconditioner_options);
break;
diff --git a/internal/ceres/levenberg_marquardt_strategy.cc b/internal/ceres/levenberg_marquardt_strategy.cc
index b32e131..76e73b4 100644
--- a/internal/ceres/levenberg_marquardt_strategy.cc
+++ b/internal/ceres/levenberg_marquardt_strategy.cc
@@ -38,6 +38,7 @@
#include "ceres/internal/eigen.h"
#include "ceres/linear_least_squares_problems.h"
#include "ceres/linear_solver.h"
+#include "ceres/parallel_for.h"
#include "ceres/sparse_matrix.h"
#include "ceres/trust_region_strategy.h"
#include "ceres/types.h"
@@ -53,7 +54,9 @@
min_diagonal_(options.min_lm_diagonal),
max_diagonal_(options.max_lm_diagonal),
decrease_factor_(2.0),
- reuse_diagonal_(false) {
+ reuse_diagonal_(false),
+ context_(options.context),
+ num_threads_(options.num_threads) {
CHECK(linear_solver_ != nullptr);
CHECK_GT(min_diagonal_, 0.0);
CHECK_LE(min_diagonal_, max_diagonal_);
@@ -77,14 +80,18 @@
diagonal_.resize(num_parameters, 1);
}
- jacobian->SquaredColumnNorm(diagonal_.data());
- for (int i = 0; i < num_parameters; ++i) {
- diagonal_[i] =
- std::min(std::max(diagonal_[i], min_diagonal_), max_diagonal_);
- }
+ jacobian->SquaredColumnNorm(diagonal_.data(), context_, num_threads_);
+ ParallelAssign(context_,
+ num_threads_,
+ diagonal_,
+ diagonal_.array().max(min_diagonal_).min(max_diagonal_));
}
- lm_diagonal_ = (diagonal_ / radius_).array().sqrt();
+ if (lm_diagonal_.size() == 0) {
+ lm_diagonal_.resize(num_parameters);
+ }
+ ParallelAssign(
+ context_, num_threads_, lm_diagonal_, (diagonal_ / radius_).cwiseSqrt());
LinearSolver::PerSolveOptions solve_options;
solve_options.D = lm_diagonal_.data();
@@ -98,7 +105,7 @@
// Invalidate the output array lm_step, so that we can detect if
// the linear solver generated numerical garbage. This is known
// to happen for the DENSE_QR and then DENSE_SCHUR solver when
- // the Jacobin is severely rank deficient and mu is too small.
+ // the Jacobian is severely rank deficient and mu is too small.
InvalidateArray(num_parameters, step);
// Instead of solving Jx = -r, solve Jy = r.
@@ -120,7 +127,8 @@
linear_solver_summary.termination_type =
LinearSolverTerminationType::FAILURE;
} else {
- VectorRef(step, num_parameters) *= -1.0;
+ VectorRef step_vec(step, num_parameters);
+ ParallelAssign(context_, num_threads_, step_vec, -step_vec);
}
reuse_diagonal_ = true;
diff --git a/internal/ceres/levenberg_marquardt_strategy.h b/internal/ceres/levenberg_marquardt_strategy.h
index 2b014a6..bc6aae3 100644
--- a/internal/ceres/levenberg_marquardt_strategy.h
+++ b/internal/ceres/levenberg_marquardt_strategy.h
@@ -38,6 +38,8 @@
namespace ceres::internal {
+class ContextImpl;
+
// Levenberg-Marquardt step computation and trust region sizing
// strategy based on on "Methods for Nonlinear Least Squares" by
// K. Madsen, H.B. Nielsen and O. Tingleff. Available to download from
@@ -81,6 +83,8 @@
// allocations in every iteration and reuse when a step fails and
// ComputeStep is called again.
Vector lm_diagonal_; // lm_diagonal_ = sqrt(diagonal_ / radius_);
+ ContextImpl* context_;
+ int num_threads_;
};
} // namespace ceres::internal
diff --git a/internal/ceres/minimizer.h b/internal/ceres/minimizer.h
index d4ba708..b7a4df7 100644
--- a/internal/ceres/minimizer.h
+++ b/internal/ceres/minimizer.h
@@ -47,6 +47,7 @@
class TrustRegionStrategy;
class CoordinateDescentMinimizer;
class LinearSolver;
+class ContextImpl;
// Interface for non-linear least squares solvers.
class CERES_NO_EXPORT Minimizer {
@@ -113,6 +114,7 @@
int max_num_iterations;
double max_solver_time_in_seconds;
int num_threads;
+ ContextImpl* context = nullptr;
// Number of times the linear solver should be retried in case of
// numerical failure. The retries are done by exponentially scaling up
diff --git a/internal/ceres/parallel_for.h b/internal/ceres/parallel_for.h
index d91a195..26a4a79 100644
--- a/internal/ceres/parallel_for.h
+++ b/internal/ceres/parallel_for.h
@@ -38,6 +38,7 @@
#include "ceres/context_impl.h"
#include "ceres/internal/disable_warnings.h"
+#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "glog/logging.h"
@@ -61,7 +62,7 @@
namespace parallel_for_details {
// Get arguments of callable as a tuple
template <typename F, typename... Args>
-std::tuple<Args...> args_of(void (F::*)(Args...) const);
+std::tuple<std::decay_t<Args>...> args_of(void (F::*)(Args...) const);
template <typename F>
using args_of_t = decltype(args_of(&F::operator()));
@@ -75,24 +76,58 @@
// For parallel for iterations of type [](int i) -> void
template <typename F>
struct InvokeImpl<F, std::tuple<int>> {
- static void Invoke(int thread_id, int i, const F& function) {
+ static void InvokeOnSegment(int thread_id,
+ int start,
+ int end,
+ const F& function) {
(void)thread_id;
- function(i);
+ for (int i = start; i < end; ++i) {
+ function(i);
+ }
}
};
// For parallel for iterations of type [](int thread_id, int i) -> void
template <typename F>
struct InvokeImpl<F, std::tuple<int, int>> {
- static void Invoke(int thread_id, int i, const F& function) {
- function(thread_id, i);
+ static void InvokeOnSegment(int thread_id,
+ int start,
+ int end,
+ const F& function) {
+ for (int i = start; i < end; ++i) {
+ function(thread_id, i);
+ }
+ }
+};
+
+// For parallel for iterations of type [](tuple<int, int> range) -> void
+template <typename F>
+struct InvokeImpl<F, std::tuple<std::tuple<int, int>>> {
+ static void InvokeOnSegment(int thread_id,
+ int start,
+ int end,
+ const F& function) {
+ (void)thread_id;
+ function(std::make_tuple(start, end));
+ }
+};
+
+// For parallel for iterations of type [](int thread_id, tuple<int, int> range)
+// -> void
+template <typename F>
+struct InvokeImpl<F, std::tuple<int, std::tuple<int, int>>> {
+ static void InvokeOnSegment(int thread_id,
+ int start,
+ int end,
+ const F& function) {
+ function(thread_id, std::make_tuple(start, end));
}
};
// Invoke function passing thread_id only if required
template <typename F>
-void Invoke(int thread_id, int i, const F& function) {
- InvokeImpl<F, args_of_t<F>>::Invoke(thread_id, i, function);
+void InvokeOnSegment(int thread_id, int start, int end, const F& function) {
+ InvokeImpl<F, args_of_t<F>>::InvokeOnSegment(thread_id, start, end, function);
}
// Check if it is possible to split range [start; end) into at most
@@ -218,7 +253,8 @@
// implemented by each threading backend
template <typename F>
void ParallelInvoke(ContextImpl* context,
- int i,
+ int start,
+ int end,
int num_threads,
const F& function);
@@ -241,9 +277,7 @@
}
if (num_threads == 1 || end - start == 1) {
- for (int i = start; i < end; ++i) {
- Invoke<F>(0, i, function);
- }
+ InvokeOnSegment<F>(0, start, end, function);
return;
}
@@ -293,9 +327,8 @@
const int partition_start = partitions[partition_id];
const int partition_end = partitions[partition_id + 1];
- for (int i = partition_start; i < partition_end; ++i) {
- Invoke<F>(thread_id, i, function);
- }
+ InvokeOnSegment<F>(
+ thread_id, partition_start, partition_end, function);
});
}
@@ -330,11 +363,50 @@
const int partition_start = partitions[partition_id];
const int partition_end = partitions[partition_id + 1];
- for (int i = partition_start; i < partition_end; ++i) {
- Invoke<F>(thread_id, i, function);
- }
+ InvokeOnSegment<F>(
+ thread_id, partition_start, partition_end, function);
});
}
+
+// Evaluate vector expression in parallel
+// Assuming LhsExpression and RhsExpression are some sort of
+// column-vector expression, assignment lhs = rhs
+// is eavluated over a set of contiguous blocks in parallel.
+// This is expected to work well in the case of vector-based
+// expressions (since they typically do not result into
+// temporaries).
+// This method expects lhs to be size-compatible with rhs
+template <typename LhsExpression, typename RhsExpression>
+void ParallelAssign(ContextImpl* context,
+ int num_threads,
+ LhsExpression& lhs,
+ const RhsExpression& rhs) {
+ static_assert(LhsExpression::ColsAtCompileTime == 1);
+ static_assert(RhsExpression::ColsAtCompileTime == 1);
+ CHECK_EQ(lhs.rows(), rhs.rows());
+ const int num_rows = lhs.rows();
+ ParallelFor(context,
+ 0,
+ num_rows,
+ num_threads,
+ [&lhs, &rhs](const std::tuple<int, int>& range) {
+ auto [start, end] = range;
+ lhs.segment(start, end - start) =
+ rhs.segment(start, end - start);
+ });
+}
+
+// Set vector to zero using num_threads
+template <typename VectorType>
+void ParallelSetZero(ContextImpl* context,
+ int num_threads,
+ VectorType& vector) {
+ ParallelSetZero(context, num_threads, vector.data(), vector.rows());
+}
+void ParallelSetZero(ContextImpl* context,
+ int num_threads,
+ double* values,
+ int num_values);
} // namespace ceres::internal
// Backend-specific implementations of ParallelInvoke
diff --git a/internal/ceres/parallel_for_cxx.cc b/internal/ceres/parallel_for_cxx.cc
index cdf9966..3f47436 100644
--- a/internal/ceres/parallel_for_cxx.cc
+++ b/internal/ceres/parallel_for_cxx.cc
@@ -74,4 +74,18 @@
int MaxNumThreadsAvailable() { return ThreadPool::MaxNumThreadsAvailable(); }
+void ParallelSetZero(ContextImpl* context,
+ int num_threads,
+ double* values,
+ int num_values) {
+ ParallelFor(context,
+ 0,
+ num_values,
+ num_threads,
+ [values](std::tuple<int, int> range) {
+ auto [start, end] = range;
+ std::fill(values + start, values + end, 0.);
+ });
+}
+
} // namespace ceres::internal
diff --git a/internal/ceres/parallel_for_cxx.h b/internal/ceres/parallel_for_cxx.h
index 0731609..c33ae08 100644
--- a/internal/ceres/parallel_for_cxx.h
+++ b/internal/ceres/parallel_for_cxx.h
@@ -213,9 +213,7 @@
const int curr_end = curr_start + base_block_size +
(block_id < num_base_p1_sized_blocks ? 1 : 0);
// Perform each task in current block
- for (int i = curr_start; i < curr_end; ++i) {
- Invoke<F>(thread_id, i, function);
- }
+ InvokeOnSegment<F>(thread_id, curr_start, curr_end, function);
}
shared_state->block_until_finished.Finished(num_jobs_finished);
};
diff --git a/internal/ceres/parallel_for_test.cc b/internal/ceres/parallel_for_test.cc
index 6974ab9..058b0a3 100644
--- a/internal/ceres/parallel_for_test.cc
+++ b/internal/ceres/parallel_for_test.cc
@@ -70,6 +70,28 @@
}
}
+// Tests parallel for loop with ranges
+TEST(ParallelForWithRange, NumThreads) {
+ ContextImpl context;
+ context.EnsureMinimumThreads(/*num_threads=*/2);
+
+ const int size = 16;
+ std::vector<int> expected_results(size, 0);
+ for (int i = 0; i < size; ++i) {
+ expected_results[i] = std::sqrt(i);
+ }
+
+ for (int num_threads = 1; num_threads <= 8; ++num_threads) {
+ std::vector<int> values(size, 0);
+ ParallelFor(
+ &context, 0, size, num_threads, [&values](std::tuple<int, int> range) {
+ auto [start, end] = range;
+ for (int i = start; i < end; ++i) values[i] = std::sqrt(i);
+ });
+ EXPECT_THAT(values, ElementsAreArray(expected_results));
+ }
+}
+
// Tests the parallel for loop with the thread ID interface computes the correct
// result for various number of threads.
TEST(ParallelForWithThreadId, NumThreads) {
@@ -410,4 +432,40 @@
}
}
+TEST(ParallelAssign, D2MulX) {
+ const int kVectorSize = 1024 * 1024;
+ const int kMaxNumThreads = 8;
+
+ const Vector D_full = Vector::Random(kVectorSize * 2);
+ const ConstVectorRef D(D_full.data() + kVectorSize, kVectorSize);
+ const Vector x = Vector::Random(kVectorSize);
+ const Vector y_expected = D.array().square() * x.array();
+ ContextImpl context;
+ context.EnsureMinimumThreads(kMaxNumThreads);
+
+ for (int num_threads = 1; num_threads <= kMaxNumThreads; ++num_threads) {
+ Vector y_observed(kVectorSize);
+ ParallelAssign(
+ &context, num_threads, y_observed, D.array().square() * x.array());
+
+ // Bit-exact result is expected
+ CHECK_EQ((y_expected - y_observed).squaredNorm(), 0.);
+ }
+}
+
+TEST(ParallelAssign, SetZero) {
+ const int kVectorSize = 1024 * 1024;
+ const int kMaxNumThreads = 8;
+
+ ContextImpl context;
+ context.EnsureMinimumThreads(kMaxNumThreads);
+
+ for (int num_threads = 1; num_threads <= kMaxNumThreads; ++num_threads) {
+ Vector x = Vector::Random(kVectorSize);
+ ParallelSetZero(&context, num_threads, x);
+
+ CHECK_EQ(x.squaredNorm(), 0.);
+ }
+}
+
} // namespace ceres::internal
diff --git a/internal/ceres/parallel_vector_operations_benchmark.cc b/internal/ceres/parallel_vector_operations_benchmark.cc
new file mode 100644
index 0000000..7180b54
--- /dev/null
+++ b/internal/ceres/parallel_vector_operations_benchmark.cc
@@ -0,0 +1,282 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+#include "benchmark/benchmark.h"
+#include "ceres/eigen_vector_ops.h"
+#include "ceres/parallel_for.h"
+
+namespace ceres::internal {
+
+const int kVectorSize = 64 * 1024 * 1024 / sizeof(double);
+
+static void SetZero(benchmark::State& state) {
+ Vector x = Vector::Random(kVectorSize);
+ for (auto _ : state) {
+ x.setZero();
+ }
+ CHECK_EQ(x.squaredNorm(), 0.);
+}
+BENCHMARK(SetZero);
+
+static void SetZeroParallel(benchmark::State& state) {
+ const int num_threads = state.range(0);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ Vector x = Vector::Random(kVectorSize);
+ for (auto _ : state) {
+ ParallelSetZero(&context, num_threads, x);
+ }
+ CHECK_EQ(x.squaredNorm(), 0.);
+}
+BENCHMARK(SetZeroParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Negate(benchmark::State& state) {
+ Vector x = Vector::Random(kVectorSize).normalized();
+ const Vector x_init = x;
+
+ for (auto _ : state) {
+ x = -x;
+ }
+ CHECK((x - x_init).squaredNorm() == 0. || (x + x_init).squaredNorm() == 0);
+}
+BENCHMARK(Negate);
+
+static void NegateParallel(benchmark::State& state) {
+ const int num_threads = state.range(0);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ Vector x = Vector::Random(kVectorSize).normalized();
+ const Vector x_init = x;
+
+ for (auto _ : state) {
+ ParallelAssign(&context, num_threads, x, -x);
+ }
+ CHECK((x - x_init).squaredNorm() == 0. || (x + x_init).squaredNorm() == 0);
+}
+BENCHMARK(NegateParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Assign(benchmark::State& state) {
+ Vector x = Vector::Random(kVectorSize);
+ Vector y = Vector(kVectorSize);
+ for (auto _ : state) {
+ y.block(0, 0, kVectorSize, 1) = x.block(0, 0, kVectorSize, 1);
+ }
+ CHECK_EQ((y - x).squaredNorm(), 0.);
+}
+BENCHMARK(Assign);
+
+static void AssignParallel(benchmark::State& state) {
+ const int num_threads = state.range(0);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ Vector x = Vector::Random(kVectorSize);
+ Vector y = Vector(kVectorSize);
+
+ for (auto _ : state) {
+ ParallelAssign(&context, num_threads, y, x);
+ }
+ CHECK_EQ((y - x).squaredNorm(), 0.);
+}
+BENCHMARK(AssignParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void D2X(benchmark::State& state) {
+ const Vector x = Vector::Random(kVectorSize);
+ const Vector D = Vector::Random(kVectorSize);
+ Vector y = Vector::Zero(kVectorSize);
+ for (auto _ : state) {
+ y = D.array().square() * x.array();
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+BENCHMARK(D2X);
+
+static void D2XParallel(benchmark::State& state) {
+ const int num_threads = state.range(0);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ const Vector x = Vector::Random(kVectorSize);
+ const Vector D = Vector::Random(kVectorSize);
+ Vector y = Vector(kVectorSize);
+
+ for (auto _ : state) {
+ ParallelAssign(&context, num_threads, y, D.array().square() * x.array());
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+BENCHMARK(D2XParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void DivideSqrt(benchmark::State& state) {
+ Vector diagonal = Vector::Random(kVectorSize).array().abs();
+ const double radius = 0.5;
+ for (auto _ : state) {
+ diagonal = (diagonal / radius).array().sqrt();
+ }
+ CHECK_GT(diagonal.squaredNorm(), 0.);
+}
+BENCHMARK(DivideSqrt);
+
+static void DivideSqrtParallel(benchmark::State& state) {
+ const int num_threads = state.range(0);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ Vector diagonal = Vector::Random(kVectorSize).array().abs();
+ const double radius = 0.5;
+ for (auto _ : state) {
+ ParallelAssign(
+ &context, num_threads, diagonal, (diagonal / radius).cwiseSqrt());
+ }
+ CHECK_GT(diagonal.squaredNorm(), 0.);
+}
+BENCHMARK(DivideSqrtParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Clamp(benchmark::State& state) {
+ Vector diagonal = Vector::Random(kVectorSize);
+ const double min = -0.5;
+ const double max = 0.5;
+ for (auto _ : state) {
+ for (int i = 0; i < kVectorSize; ++i) {
+ diagonal[i] = std::min(std::max(diagonal[i], min), max);
+ }
+ }
+ CHECK_LE(diagonal.maxCoeff(), 0.5);
+ CHECK_GE(diagonal.minCoeff(), -0.5);
+}
+BENCHMARK(Clamp);
+
+static void ClampParallel(benchmark::State& state) {
+ const int num_threads = state.range(0);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ Vector diagonal = Vector::Random(kVectorSize);
+ const double min = -0.5;
+ const double max = 0.5;
+ for (auto _ : state) {
+ ParallelAssign(
+ &context, num_threads, diagonal, diagonal.array().max(min).min(max));
+ }
+ CHECK_LE(diagonal.maxCoeff(), 0.5);
+ CHECK_GE(diagonal.minCoeff(), -0.5);
+}
+BENCHMARK(ClampParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Norm(benchmark::State& state) {
+ const Vector x = Vector::Random(kVectorSize);
+
+ double total = 0.;
+ for (auto _ : state) {
+ total += x.norm();
+ }
+ CHECK_GT(total, 0.);
+}
+BENCHMARK(Norm);
+
+static void NormParallel(benchmark::State& state) {
+ const int num_threads = state.range(0);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ const Vector x = Vector::Random(kVectorSize);
+
+ double total = 0.;
+ for (auto _ : state) {
+ total += Norm(x, &context, num_threads);
+ }
+ CHECK_GT(total, 0.);
+}
+BENCHMARK(NormParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Dot(benchmark::State& state) {
+ const Vector x = Vector::Random(kVectorSize);
+ const Vector y = Vector::Random(kVectorSize);
+
+ double total = 0.;
+ for (auto _ : state) {
+ total += x.dot(y);
+ }
+ CHECK_NE(total, 0.);
+}
+BENCHMARK(Dot);
+
+static void DotParallel(benchmark::State& state) {
+ const int num_threads = state.range(0);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ const Vector x = Vector::Random(kVectorSize);
+ const Vector y = Vector::Random(kVectorSize);
+
+ double total = 0.;
+ for (auto _ : state) {
+ total += Dot(x, y, &context, num_threads);
+ }
+ CHECK_NE(total, 0.);
+}
+BENCHMARK(DotParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+static void Axpby(benchmark::State& state) {
+ const Vector x = Vector::Random(kVectorSize);
+ const Vector y = Vector::Random(kVectorSize);
+ Vector z = Vector::Zero(kVectorSize);
+ const double a = 3.1415;
+ const double b = 1.2345;
+
+ double total = 0.;
+ for (auto _ : state) {
+ z = a * x + b * y;
+ }
+ CHECK_GT(z.squaredNorm(), 0.);
+}
+BENCHMARK(Axpby);
+
+static void AxpbyParallel(benchmark::State& state) {
+ const int num_threads = state.range(0);
+ ContextImpl context;
+ context.EnsureMinimumThreads(num_threads);
+
+ const Vector x = Vector::Random(kVectorSize);
+ const Vector y = Vector::Random(kVectorSize);
+ Vector z = Vector::Zero(kVectorSize);
+ const double a = 3.1415;
+ const double b = 1.2345;
+
+ double total = 0.;
+ for (auto _ : state) {
+ Axpby(a, x, b, y, z, &context, num_threads);
+ }
+ CHECK_GT(z.squaredNorm(), 0.);
+}
+BENCHMARK(AxpbyParallel)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+} // namespace ceres::internal
+
+BENCHMARK_MAIN();
diff --git a/internal/ceres/partitioned_matrix_view_test.cc b/internal/ceres/partitioned_matrix_view_test.cc
index 2ff99bf..f9fee84 100644
--- a/internal/ceres/partitioned_matrix_view_test.cc
+++ b/internal/ceres/partitioned_matrix_view_test.cc
@@ -68,7 +68,6 @@
CHECK(problem != nullptr);
A_ = std::move(problem->A);
auto block_sparse = down_cast<BlockSparseMatrix*>(A_.get());
- block_sparse->AddTransposeBlockStructure();
options_.num_threads = num_threads;
options_.context = &context_;
diff --git a/internal/ceres/preconditioner.cc b/internal/ceres/preconditioner.cc
index 33174de..3e2bf41 100644
--- a/internal/ceres/preconditioner.cc
+++ b/internal/ceres/preconditioner.cc
@@ -47,8 +47,8 @@
}
SparseMatrixPreconditionerWrapper::SparseMatrixPreconditionerWrapper(
- const SparseMatrix* matrix)
- : matrix_(matrix) {
+ const SparseMatrix* matrix, const Preconditioner::Options& options)
+ : matrix_(matrix), options_(options) {
CHECK(matrix != nullptr);
}
@@ -62,7 +62,8 @@
void SparseMatrixPreconditionerWrapper::RightMultiplyAndAccumulate(
const double* x, double* y) const {
- matrix_->RightMultiplyAndAccumulate(x, y);
+ matrix_->RightMultiplyAndAccumulate(
+ x, y, options_.context, options_.num_threads);
}
int SparseMatrixPreconditionerWrapper::num_rows() const {
diff --git a/internal/ceres/preconditioner.h b/internal/ceres/preconditioner.h
index 2d343bd..654ef60 100644
--- a/internal/ceres/preconditioner.h
+++ b/internal/ceres/preconditioner.h
@@ -186,7 +186,8 @@
: public SparseMatrixPreconditioner {
public:
// Wrapper does NOT take ownership of the matrix pointer.
- explicit SparseMatrixPreconditionerWrapper(const SparseMatrix* matrix);
+ explicit SparseMatrixPreconditionerWrapper(
+ const SparseMatrix* matrix, const Preconditioner::Options& options);
~SparseMatrixPreconditionerWrapper() override;
// Preconditioner interface
@@ -196,6 +197,7 @@
private:
bool UpdateImpl(const SparseMatrix& A, const double* D) override;
const SparseMatrix* matrix_;
+ const Preconditioner::Options options_;
};
} // namespace ceres::internal
diff --git a/internal/ceres/preprocessor.cc b/internal/ceres/preprocessor.cc
index 9d60ef9..286dfda 100644
--- a/internal/ceres/preprocessor.cc
+++ b/internal/ceres/preprocessor.cc
@@ -82,9 +82,11 @@
double* reduced_parameters = pp->reduced_parameters.data();
program->ParameterBlocksToStateVector(reduced_parameters);
+ auto context = pp->problem->context();
Minimizer::Options& minimizer_options = pp->minimizer_options;
minimizer_options = Minimizer::Options(options);
minimizer_options.evaluator = pp->evaluator;
+ minimizer_options.context = context;
if (options.logging_type != SILENT) {
pp->logging_callback = std::make_unique<LoggingCallback>(
diff --git a/internal/ceres/program.cc b/internal/ceres/program.cc
index 9a4ecf3..7549744 100644
--- a/internal/ceres/program.cc
+++ b/internal/ceres/program.cc
@@ -45,6 +45,7 @@
#include "ceres/loss_function.h"
#include "ceres/manifold.h"
#include "ceres/map_util.h"
+#include "ceres/parallel_for.h"
#include "ceres/parameter_block.h"
#include "ceres/problem.h"
#include "ceres/residual_block.h"
@@ -108,16 +109,32 @@
bool Program::Plus(const double* state,
const double* delta,
- double* state_plus_delta) const {
- for (auto* parameter_block : parameter_blocks_) {
- if (!parameter_block->Plus(state, delta, state_plus_delta)) {
- return false;
- }
- state += parameter_block->Size();
- delta += parameter_block->TangentSize();
- state_plus_delta += parameter_block->Size();
- }
- return true;
+ double* state_plus_delta,
+ ContextImpl* context,
+ int num_threads) const {
+ std::atomic<bool> abort(false);
+ auto* parameter_blocks = parameter_blocks_.data();
+ ParallelFor(
+ context,
+ 0,
+ parameter_blocks_.size(),
+ num_threads,
+ [&abort, state, delta, state_plus_delta, parameter_blocks](int block_id) {
+ if (abort) {
+ return;
+ }
+ auto parameter_block = parameter_blocks[block_id];
+
+ auto block_state = state + parameter_block->state_offset();
+ auto block_delta = delta + parameter_block->delta_offset();
+ auto block_state_plus_delta =
+ state_plus_delta + parameter_block->state_offset();
+ if (!parameter_block->Plus(
+ block_state, block_delta, block_state_plus_delta)) {
+ abort = true;
+ }
+ });
+ return abort == false;
}
void Program::SetParameterOffsetsAndIndex() {
diff --git a/internal/ceres/program.h b/internal/ceres/program.h
index cb923ae..7c0ceee 100644
--- a/internal/ceres/program.h
+++ b/internal/ceres/program.h
@@ -46,6 +46,7 @@
class ProblemImpl;
class ResidualBlock;
class TripletSparseMatrix;
+class ContextImpl;
// A nonlinear least squares optimization problem. This is different from the
// similarly-named "Problem" object, which offers a mutation interface for
@@ -86,7 +87,9 @@
// Update a state vector for the program given a delta.
bool Plus(const double* state,
const double* delta,
- double* state_plus_delta) const;
+ double* state_plus_delta,
+ ContextImpl* context,
+ int num_threads) const;
// Set the parameter indices and offsets. This permits mapping backward
// from a ParameterBlock* to an index in the parameter_blocks() vector. For
diff --git a/internal/ceres/program_evaluator.h b/internal/ceres/program_evaluator.h
index 251a177..cc4da9f 100644
--- a/internal/ceres/program_evaluator.h
+++ b/internal/ceres/program_evaluator.h
@@ -118,7 +118,8 @@
program_(program),
jacobian_writer_(options, program),
evaluate_preparers_(std::move(
- jacobian_writer_.CreateEvaluatePreparers(options.num_threads))) {
+ jacobian_writer_.CreateEvaluatePreparers(options.num_threads))),
+ num_parameters_(program->NumEffectiveParameters()) {
BuildResidualLayout(*program, &residual_layout_);
evaluate_scratch_ = std::move(CreateEvaluatorScratch(
*program, static_cast<unsigned>(options.num_threads)));
@@ -155,20 +156,24 @@
}
if (residuals != nullptr) {
- VectorRef(residuals, program_->NumResiduals()).setZero();
+ ParallelSetZero(options_.context,
+ options_.num_threads,
+ residuals,
+ program_->NumResiduals());
}
if (jacobian != nullptr) {
- jacobian->SetZero();
+ jacobian->SetZero(options_.context, options_.num_threads);
}
// Each thread gets it's own cost and evaluate scratch space.
for (int i = 0; i < options_.num_threads; ++i) {
evaluate_scratch_[i].cost = 0.0;
if (gradient != nullptr) {
- VectorRef(evaluate_scratch_[i].gradient.get(),
- program_->NumEffectiveParameters())
- .setZero();
+ ParallelSetZero(options_.context,
+ options_.num_threads,
+ evaluate_scratch_[i].gradient.get(),
+ num_parameters_);
}
}
@@ -251,18 +256,23 @@
});
if (!abort) {
- const int num_parameters = program_->NumEffectiveParameters();
-
// Sum the cost and gradient (if requested) from each thread.
(*cost) = 0.0;
if (gradient != nullptr) {
- VectorRef(gradient, num_parameters).setZero();
+ auto gradient_vector = VectorRef(gradient, num_parameters_);
+ ParallelSetZero(
+ options_.context, options_.num_threads, gradient_vector);
}
for (int i = 0; i < options_.num_threads; ++i) {
(*cost) += evaluate_scratch_[i].cost;
if (gradient != nullptr) {
- VectorRef(gradient, num_parameters) +=
- VectorRef(evaluate_scratch_[i].gradient.get(), num_parameters);
+ auto gradient_vector = VectorRef(gradient, num_parameters_);
+ ParallelAssign(
+ options_.context,
+ options_.num_threads,
+ gradient_vector,
+ gradient_vector + VectorRef(evaluate_scratch_[i].gradient.get(),
+ num_parameters_));
}
}
@@ -272,7 +282,7 @@
// necessary.
if (jacobian != nullptr) {
JacobianFinalizer f;
- f(jacobian, num_parameters);
+ f(jacobian, num_parameters_);
}
}
return !abort;
@@ -281,7 +291,8 @@
bool Plus(const double* state,
const double* delta,
double* state_plus_delta) const final {
- return program_->Plus(state, delta, state_plus_delta);
+ return program_->Plus(
+ state, delta, state_plus_delta, options_.context, options_.num_threads);
}
int NumParameters() const final { return program_->NumParameters(); }
@@ -361,6 +372,7 @@
std::unique_ptr<EvaluatePreparer[]> evaluate_preparers_;
std::unique_ptr<EvaluateScratch[]> evaluate_scratch_;
std::vector<int> residual_layout_;
+ int num_parameters_;
::ceres::internal::ExecutionSummary execution_summary_;
};
diff --git a/internal/ceres/sparse_matrix.cc b/internal/ceres/sparse_matrix.cc
index 40899bb..584ac1d 100644
--- a/internal/ceres/sparse_matrix.cc
+++ b/internal/ceres/sparse_matrix.cc
@@ -34,4 +34,20 @@
SparseMatrix::~SparseMatrix() = default;
+void SparseMatrix::SquaredColumnNorm(double* x,
+ ContextImpl* context,
+ int num_threads) const {
+ (void)context;
+ (void)num_threads;
+ SquaredColumnNorm(x);
+}
+
+void SparseMatrix::ScaleColumns(const double* x,
+ ContextImpl* context,
+ int num_threads) {
+ (void)context;
+ (void)num_threads;
+ ScaleColumns(x);
+}
+
} // namespace ceres::internal
diff --git a/internal/ceres/sparse_matrix.h b/internal/ceres/sparse_matrix.h
index deab7b1..6cc421b 100644
--- a/internal/ceres/sparse_matrix.h
+++ b/internal/ceres/sparse_matrix.h
@@ -69,19 +69,28 @@
~SparseMatrix() override;
// y += Ax;
+ using LinearOperator::RightMultiplyAndAccumulate;
void RightMultiplyAndAccumulate(const double* x,
double* y) const override = 0;
+
// y += A'x;
void LeftMultiplyAndAccumulate(const double* x, double* y) const override = 0;
// In MATLAB notation sum(A.*A, 1)
virtual void SquaredColumnNorm(double* x) const = 0;
+ virtual void SquaredColumnNorm(double* x,
+ ContextImpl* context,
+ int num_threads) const;
// A = A * diag(scale)
virtual void ScaleColumns(const double* scale) = 0;
+ virtual void ScaleColumns(const double* scale,
+ ContextImpl* context,
+ int num_threads);
// A = 0. A->num_nonzeros() == 0 is true after this call. The
// sparsity pattern is preserved.
virtual void SetZero() = 0;
+ virtual void SetZero(ContextImpl* contex, int num_threads) { SetZero(); }
// Resize and populate dense_matrix with a dense version of the
// sparse matrix.
diff --git a/internal/ceres/trust_region_minimizer.cc b/internal/ceres/trust_region_minimizer.cc
index 800b676..47f7fd5 100644
--- a/internal/ceres/trust_region_minimizer.cc
+++ b/internal/ceres/trust_region_minimizer.cc
@@ -42,9 +42,11 @@
#include "Eigen/Core"
#include "ceres/array_utils.h"
#include "ceres/coordinate_descent_minimizer.h"
+#include "ceres/eigen_vector_ops.h"
#include "ceres/evaluator.h"
#include "ceres/file.h"
#include "ceres/line_search.h"
+#include "ceres/parallel_for.h"
#include "ceres/stringprintf.h"
#include "ceres/types.h"
#include "ceres/wall_time.h"
@@ -268,7 +270,8 @@
}
// jacobian = jacobian * diag(J'J) ^{-1}
- jacobian_->ScaleColumns(jacobian_scaling_.data());
+ jacobian_->ScaleColumns(
+ jacobian_scaling_.data(), options_.context, options_.num_threads);
}
// The gradient exists in the local tangent space. To account for
@@ -419,11 +422,15 @@
// = f'f/2 - 1/2 [ f'f + 2f'J * step + step' * J' * J * step]
// = -f'J * step - step' * J' * J * step / 2
// = -(J * step)'(f + J * step / 2)
- model_residuals_.setZero();
+ ParallelSetZero(options_.context, options_.num_threads, model_residuals_);
jacobian_->RightMultiplyAndAccumulate(trust_region_step_.data(),
- model_residuals_.data());
- model_cost_change_ =
- -model_residuals_.dot(residuals_ + model_residuals_ / 2.0);
+ model_residuals_.data(),
+ options_.context,
+ options_.num_threads);
+ model_cost_change_ = -Dot(model_residuals_,
+ residuals_ + model_residuals_ / 2.0,
+ options_.context,
+ options_.num_threads);
// TODO(sameeragarwal)
//
@@ -433,7 +440,10 @@
iteration_summary_.step_is_valid = (model_cost_change_ > 0.0);
if (iteration_summary_.step_is_valid) {
// Undo the Jacobian column scaling.
- delta_ = (trust_region_step_.array() * jacobian_scaling_.array()).matrix();
+ ParallelAssign(options_.context,
+ options_.num_threads,
+ delta_,
+ (trust_region_step_.array() * jacobian_scaling_.array()));
num_consecutive_invalid_steps_ = 0;
}
diff --git a/internal/ceres/trust_region_preprocessor.cc b/internal/ceres/trust_region_preprocessor.cc
index 84e783f..67f928e 100644
--- a/internal/ceres/trust_region_preprocessor.cc
+++ b/internal/ceres/trust_region_preprocessor.cc
@@ -355,6 +355,8 @@
strategy_options.trust_region_strategy_type =
options.trust_region_strategy_type;
strategy_options.dogleg_type = options.dogleg_type;
+ strategy_options.context = pp->problem->context();
+ strategy_options.num_threads = options.num_threads;
pp->minimizer_options.trust_region_strategy =
TrustRegionStrategy::Create(strategy_options);
CHECK(pp->minimizer_options.trust_region_strategy != nullptr);
diff --git a/internal/ceres/trust_region_strategy.h b/internal/ceres/trust_region_strategy.h
index 334f06f..2be10f3 100644
--- a/internal/ceres/trust_region_strategy.h
+++ b/internal/ceres/trust_region_strategy.h
@@ -73,6 +73,9 @@
// Further specify which dogleg method to use
DoglegType dogleg_type = TRADITIONAL_DOGLEG;
+
+ ContextImpl* context = nullptr;
+ int num_threads = 1;
};
// Factory.