Parallel left products for PartitionedMatrixView
Parallel left products for PartitionedMatrixView using parallel for
loops with fair partitioning.
Updates for evaluation benchmark including replacing program with a
preprocessed one
Change-Id: Ia1cc3293f106cec6b7d933675cea4e7c5a6b71e4
diff --git a/internal/ceres/bundle_adjustment_test_util.h b/internal/ceres/bundle_adjustment_test_util.h
index 6b45d46..696b39e 100644
--- a/internal/ceres/bundle_adjustment_test_util.h
+++ b/internal/ceres/bundle_adjustment_test_util.h
@@ -84,14 +84,15 @@
Solver::Options* mutable_solver_options() { return &options_; }
// clang-format off
- int num_cameras() const { return num_cameras_; }
- int num_points() const { return num_points_; }
- int num_observations() const { return num_observations_; }
- const int* point_index() const { return point_index_; }
- const int* camera_index() const { return camera_index_; }
- const double* observations() const { return observations_; }
- double* mutable_cameras() { return parameters_; }
- double* mutable_points() { return parameters_ + 9 * num_cameras_; }
+ int num_cameras() const { return num_cameras_; }
+ int num_points() const { return num_points_; }
+ int num_observations() const { return num_observations_; }
+ const int* point_index() const { return point_index_; }
+ const int* camera_index() const { return camera_index_; }
+ const double* observations() const { return observations_; }
+ double* mutable_cameras() { return parameters_; }
+ double* mutable_points() { return parameters_ + 9 * num_cameras_; }
+ const Solver::Options& options() const { return options_; }
// clang-format on
static double kResidualTolerance;
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 56b4f19..aba2386 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -39,6 +39,8 @@
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/evaluator.h"
+#include "ceres/partitioned_matrix_view.h"
+#include "ceres/preprocessor.h"
#include "ceres/problem.h"
#include "ceres/problem_impl.h"
#include "ceres/program.h"
@@ -56,12 +58,22 @@
// each dataset is being loaded at most once.
// Each type of jacobians is also cached after first creation
struct BALData {
+ using PartitionedView = PartitionedMatrixView<2, 3, 9>;
explicit BALData(const std::string& path) {
bal_problem = std::make_unique<BundleAdjustmentProblem>(path);
CHECK(bal_problem != nullptr);
auto problem_impl = bal_problem->mutable_problem()->mutable_impl();
- auto program = problem_impl->mutable_program();
+ auto preprocessor = Preprocessor::Create(MinimizerType::TRUST_REGION);
+
+ preprocessed_problem = std::make_unique<PreprocessedProblem>();
+ Solver::Options options = bal_problem->options();
+ options.linear_solver_type = ITERATIVE_SCHUR;
+ CHECK(preprocessor->Preprocess(
+ options, problem_impl, preprocessed_problem.get()));
+
+ auto program = preprocessed_problem->reduced_program.get();
+
parameters.resize(program->NumParameters());
program->ParameterBlocksToStateVector(parameters.data());
}
@@ -76,10 +88,10 @@
options.linear_solver_type = ITERATIVE_SCHUR;
options.num_threads = 1;
options.context = context;
- options.num_eliminate_blocks = 0;
+ options.num_eliminate_blocks = bal_problem->num_points();
std::string error;
- auto program = problem_impl->mutable_program();
+ auto program = preprocessed_problem->reduced_program.get();
auto evaluator = Evaluator::Create(options, program, &error);
CHECK(evaluator != nullptr);
@@ -134,19 +146,39 @@
return crs_jacobian.get();
}
+ const PartitionedView* PartitionedMatrixViewJacobian(ContextImpl* context) {
+ if (!partitioned_view_jacobian) {
+ auto block_sparse = BlockSparseJacobian(context);
+ partitioned_view_jacobian = std::make_unique<PartitionedView>(
+ *block_sparse, bal_problem->num_points());
+ }
+ return partitioned_view_jacobian.get();
+ }
+
+ const PartitionedView* PartitionedMatrixViewJacobianWithTranspose(
+ ContextImpl* context) {
+ if (!partitioned_view_jacobian_with_transpose) {
+ auto block_sparse_transpose = BlockSparseJacobianWithTranspose(context);
+ partitioned_view_jacobian_with_transpose =
+ std::make_unique<PartitionedView>(*block_sparse_transpose,
+ bal_problem->num_points());
+ }
+ return partitioned_view_jacobian_with_transpose.get();
+ }
+
Vector parameters;
std::unique_ptr<BundleAdjustmentProblem> bal_problem;
+ std::unique_ptr<PreprocessedProblem> preprocessed_problem;
std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_with_transpose;
std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
+ std::unique_ptr<PartitionedView> partitioned_view_jacobian;
+ std::unique_ptr<PartitionedView> partitioned_view_jacobian_with_transpose;
};
static void Residuals(benchmark::State& state,
BALData* data,
ContextImpl* context) {
- auto problem = data->bal_problem->mutable_problem();
- auto problem_impl = problem->mutable_impl();
- CHECK(problem_impl != nullptr);
const int num_threads = state.range(0);
Evaluator::Options options;
@@ -156,7 +188,9 @@
options.num_eliminate_blocks = 0;
std::string error;
- auto program = problem_impl->mutable_program();
+ CHECK(data->preprocessed_problem != nullptr);
+ auto program = data->preprocessed_problem->reduced_program.get();
+ CHECK(program != nullptr);
auto evaluator = Evaluator::Create(options, program, &error);
CHECK(evaluator != nullptr);
@@ -177,9 +211,6 @@
static void ResidualsAndJacobian(benchmark::State& state,
BALData* data,
ContextImpl* context) {
- auto problem = data->bal_problem->mutable_problem();
- auto problem_impl = problem->mutable_impl();
- CHECK(problem_impl != nullptr);
const int num_threads = state.range(0);
Evaluator::Options options;
@@ -189,7 +220,9 @@
options.num_eliminate_blocks = 0;
std::string error;
- auto program = problem_impl->mutable_program();
+ CHECK(data->preprocessed_problem != nullptr);
+ auto program = data->preprocessed_problem->reduced_program.get();
+ CHECK(program != nullptr);
auto evaluator = Evaluator::Create(options, program, &error);
CHECK(evaluator != nullptr);
@@ -208,6 +241,107 @@
}
}
+static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ const int num_threads = state.range(0);
+
+ auto jacobian = data->PartitionedMatrixViewJacobian(context);
+
+ Vector y = Vector::Zero(jacobian->num_rows());
+ Vector x = Vector::Random(jacobian->num_cols_f());
+
+ for (auto _ : state) {
+ jacobian->RightMultiplyAndAccumulateF(
+ x.data(), y.data(), context, num_threads);
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateF(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ const int num_threads = state.range(0);
+
+ auto jacobian = data->PartitionedMatrixViewJacobian(context);
+
+ Vector y = Vector::Zero(jacobian->num_cols_f());
+ Vector x = Vector::Random(jacobian->num_rows());
+
+ for (auto _ : state) {
+ jacobian->LeftMultiplyAndAccumulateF(
+ x.data(), y.data(), context, num_threads);
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateWithTransposeF(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ const int num_threads = state.range(0);
+
+ auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(context);
+
+ Vector y = Vector::Zero(jacobian->num_cols_f());
+ Vector x = Vector::Random(jacobian->num_rows());
+
+ for (auto _ : state) {
+ jacobian->LeftMultiplyAndAccumulateF(
+ x.data(), y.data(), context, num_threads);
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+static void PMVRightMultiplyAndAccumulateE(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ const int num_threads = state.range(0);
+
+ auto jacobian = data->PartitionedMatrixViewJacobian(context);
+
+ Vector y = Vector::Zero(jacobian->num_rows());
+ Vector x = Vector::Random(jacobian->num_cols_e());
+
+ for (auto _ : state) {
+ jacobian->RightMultiplyAndAccumulateE(
+ x.data(), y.data(), context, num_threads);
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateE(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ const int num_threads = state.range(0);
+
+ auto jacobian = data->PartitionedMatrixViewJacobian(context);
+
+ Vector y = Vector::Zero(jacobian->num_cols_e());
+ Vector x = Vector::Random(jacobian->num_rows());
+
+ for (auto _ : state) {
+ jacobian->LeftMultiplyAndAccumulateE(
+ x.data(), y.data(), context, num_threads);
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateWithTransposeE(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ const int num_threads = state.range(0);
+
+ auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(context);
+
+ Vector y = Vector::Zero(jacobian->num_cols_e());
+ Vector x = Vector::Random(jacobian->num_rows());
+
+ for (auto _ : state) {
+ jacobian->LeftMultiplyAndAccumulateE(
+ x.data(), y.data(), context, num_threads);
+ }
+ CHECK_GT(y.squaredNorm(), 0.);
+}
+
static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
BALData* data,
ContextImpl* context) {
@@ -349,6 +483,7 @@
->Arg(4)
->Arg(8)
->Arg(16);
+
const std::string name_jacobians = "ResidualsAndJacobian<" + path + ">";
::benchmark::RegisterBenchmark(name_jacobians.c_str(),
ceres::internal::ResidualsAndJacobian,
@@ -359,6 +494,7 @@
->Arg(4)
->Arg(8)
->Arg(16);
+
const std::string name_right_product =
"JacobianRightMultiplyAndAccumulate<" + path + ">";
::benchmark::RegisterBenchmark(
@@ -372,6 +508,32 @@
->Arg(8)
->Arg(16);
+ const std::string name_right_product_partitioned_f =
+ "PMVRightMultiplyAndAccumulateF<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_right_product_partitioned_f.c_str(),
+ ceres::internal::PMVRightMultiplyAndAccumulateF,
+ data,
+ &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
+
+ const std::string name_right_product_partitioned_e =
+ "PMVRightMultiplyAndAccumulateE<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_right_product_partitioned_e.c_str(),
+ ceres::internal::PMVRightMultiplyAndAccumulateE,
+ data,
+ &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
+
#ifndef CERES_NO_CUDA
const std::string name_right_product_cuda =
"JacobianRightMultiplyAndAccumulateCuda<" + path + ">";
@@ -403,8 +565,50 @@
->Arg(2)
->Arg(4)
->Arg(8)
- ->Arg(16)
- ->Arg(28);
+ ->Arg(16);
+
+ const std::string name_left_product_partitioned_f =
+ "PMVLeftMultiplyAndAccumulateF<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_left_product_partitioned_f.c_str(),
+ ceres::internal::PMVLeftMultiplyAndAccumulateF,
+ data,
+ &context)
+ ->Arg(1);
+ const std::string name_left_product_partitioned_transpose_f =
+ "PMVLeftMultiplyAndAccumulateWithTransposeF<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_left_product_partitioned_transpose_f.c_str(),
+ ceres::internal::PMVLeftMultiplyAndAccumulateWithTransposeF,
+ data,
+ &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
+
+ const std::string name_left_product_partitioned_e =
+ "PMVLeftMultiplyAndAccumulateE<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_left_product_partitioned_e.c_str(),
+ ceres::internal::PMVLeftMultiplyAndAccumulateE,
+ data,
+ &context)
+ ->Arg(1);
+
+ const std::string name_left_product_partitioned_transpose_e =
+ "PMVLeftMultiplyAndAccumulateWithTransposeE<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_left_product_partitioned_transpose_e.c_str(),
+ ceres::internal::PMVLeftMultiplyAndAccumulateWithTransposeE,
+ data,
+ &context)
+ ->Arg(1)
+ ->Arg(2)
+ ->Arg(4)
+ ->Arg(8)
+ ->Arg(16);
#ifndef CERES_NO_CUDA
const std::string name_left_product_cuda =
diff --git a/internal/ceres/implicit_schur_complement.cc b/internal/ceres/implicit_schur_complement.cc
index bbd5e4b..6249c8c 100644
--- a/internal/ceres/implicit_schur_complement.cc
+++ b/internal/ceres/implicit_schur_complement.cc
@@ -110,7 +110,10 @@
// y2 = E' y1
tmp_e_cols_.setZero();
- A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
+ A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(),
+ tmp_e_cols_.data(),
+ options_.context,
+ options_.num_threads);
// y3 = -(E'E)^-1 y2
tmp_e_cols_2_.setZero();
@@ -137,7 +140,8 @@
}
// y = y5 + F' y1
- A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), y);
+ A_->LeftMultiplyAndAccumulateF(
+ tmp_rows_.data(), y, options_.context, options_.num_threads);
}
void ImplicitSchurComplement::InversePowerSeriesOperatorRightMultiplyAccumulate(
@@ -150,7 +154,10 @@
// y2 = E' y1
tmp_e_cols_.setZero();
- A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
+ A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(),
+ tmp_e_cols_.data(),
+ options_.context,
+ options_.num_threads);
// y3 = (E'E)^-1 y2
tmp_e_cols_2_.setZero();
@@ -167,7 +174,10 @@
// y4 = F' y1
tmp_f_cols_.setZero();
- A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), tmp_f_cols_.data());
+ A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(),
+ tmp_f_cols_.data(),
+ options_.context,
+ options_.num_threads);
// y += (F'F)^-1 y4
block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate(
@@ -217,7 +227,10 @@
// y3 = E' y2
tmp_e_cols_.setZero();
- A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data());
+ A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(),
+ tmp_e_cols_.data(),
+ options_.context,
+ options_.num_threads);
// y = (E'E)^-1 y3
VectorRef(y, num_cols).setZero();
@@ -241,7 +254,8 @@
void ImplicitSchurComplement::UpdateRhs() {
// y1 = E'b
tmp_e_cols_.setZero();
- A_->LeftMultiplyAndAccumulateE(b_, tmp_e_cols_.data());
+ A_->LeftMultiplyAndAccumulateE(
+ b_, tmp_e_cols_.data(), options_.context, options_.num_threads);
// y2 = (E'E)^-1 y1
Vector y2 = Vector::Zero(A_->num_cols_e());
@@ -258,7 +272,8 @@
// rhs = F' y3
rhs_.setZero();
- A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), rhs_.data());
+ A_->LeftMultiplyAndAccumulateF(
+ tmp_rows_.data(), rhs_.data(), options_.context, options_.num_threads);
}
} // namespace ceres::internal
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc
index db4ad2b..589a1d2 100644
--- a/internal/ceres/iterative_schur_complement_solver.cc
+++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -68,6 +68,9 @@
EventLogger event_logger("IterativeSchurComplementSolver::Solve");
CHECK(A->block_structure() != nullptr);
+ if (options_.num_threads != 1) {
+ A->AddTransposeBlockStructure();
+ }
const int num_eliminate_blocks = options_.elimination_groups[0];
// Initialize a ImplicitSchurComplement object.
if (schur_complement_ == nullptr) {
diff --git a/internal/ceres/linear_least_squares_problems.cc b/internal/ceres/linear_least_squares_problems.cc
index 6cee2f6..4c55fbc 100644
--- a/internal/ceres/linear_least_squares_problems.cc
+++ b/internal/ceres/linear_least_squares_problems.cc
@@ -63,6 +63,8 @@
return LinearLeastSquaresProblem4();
case 5:
return LinearLeastSquaresProblem5();
+ case 6:
+ return LinearLeastSquaresProblem6();
default:
LOG(FATAL) << "Unknown problem id requested " << id;
}
@@ -798,6 +800,133 @@
return problem;
}
+/*
+ A = [1 2 0 0 0 1 1
+ 1 4 0 0 0 5 6
+ 3 4 0 0 0 7 8
+ 5 6 0 0 0 9 0
+ 0 0 9 0 0 3 1]
+
+ b = [0
+ 1
+ 2
+ 3
+ 4]
+*/
+// BlockSparseMatrix version
+//
+// This problem has the unique property that it has two different
+// sized f-blocks, but only one of them occurs in the rows involving
+// the one e-block. So performing Schur elimination on this problem
+// tests the Schur Eliminator's ability to handle non-e-block rows
+// correctly when their structure does not conform to the static
+// structure determined by DetectStructure.
+//
+// Additionally, this problem has the first row of the last row block of E being
+// larger than number of row blocks in E
+//
+// NOTE: This problem is too small and rank deficient to be solved without
+// the diagonal regularization.
+std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem6() {
+ int num_rows = 5;
+ int num_cols = 7;
+
+ auto problem = std::make_unique<LinearLeastSquaresProblem>();
+
+ problem->b = std::make_unique<double[]>(num_rows);
+ problem->D = std::make_unique<double[]>(num_cols);
+ problem->num_eliminate_blocks = 1;
+
+ auto* bs = new CompressedRowBlockStructure;
+ auto values = std::make_unique<double[]>(num_rows * num_cols);
+
+ // Column block structure
+ bs->cols.emplace_back();
+ bs->cols.back().size = 2;
+ bs->cols.back().position = 0;
+
+ bs->cols.emplace_back();
+ bs->cols.back().size = 3;
+ bs->cols.back().position = 2;
+
+ bs->cols.emplace_back();
+ bs->cols.back().size = 2;
+ bs->cols.back().position = 5;
+
+ int nnz = 0;
+
+ // Row 1 & 2
+ {
+ bs->rows.emplace_back();
+ CompressedRow& row = bs->rows.back();
+ row.block.size = 2;
+ row.block.position = 0;
+
+ row.cells.emplace_back(0, nnz);
+ values[nnz++] = 1;
+ values[nnz++] = 2;
+ values[nnz++] = 1;
+ values[nnz++] = 4;
+
+ row.cells.emplace_back(2, nnz);
+ values[nnz++] = 1;
+ values[nnz++] = 1;
+ values[nnz++] = 5;
+ values[nnz++] = 6;
+ }
+
+ // Row 3 & 4
+ {
+ bs->rows.emplace_back();
+ CompressedRow& row = bs->rows.back();
+ row.block.size = 2;
+ row.block.position = 2;
+
+ row.cells.emplace_back(0, nnz);
+ values[nnz++] = 3;
+ values[nnz++] = 4;
+ values[nnz++] = 5;
+ values[nnz++] = 6;
+
+ row.cells.emplace_back(2, nnz);
+ values[nnz++] = 7;
+ values[nnz++] = 8;
+ values[nnz++] = 9;
+ values[nnz++] = 0;
+ }
+
+ // Row 5
+ {
+ bs->rows.emplace_back();
+ CompressedRow& row = bs->rows.back();
+ row.block.size = 1;
+ row.block.position = 4;
+
+ row.cells.emplace_back(1, nnz);
+ values[nnz++] = 9;
+ values[nnz++] = 0;
+ values[nnz++] = 0;
+
+ row.cells.emplace_back(2, nnz);
+ values[nnz++] = 3;
+ values[nnz++] = 1;
+ }
+
+ auto A = std::make_unique<BlockSparseMatrix>(bs);
+ memcpy(A->mutable_values(), values.get(), nnz * sizeof(*A->values()));
+
+ for (int i = 0; i < num_cols; ++i) {
+ problem->D.get()[i] = (i + 1) * 100;
+ }
+
+ for (int i = 0; i < num_rows; ++i) {
+ problem->b.get()[i] = i;
+ }
+
+ problem->A = std::move(A);
+ return problem;
+}
+
namespace {
bool DumpLinearLeastSquaresProblemToConsole(const SparseMatrix* A,
const double* D,
diff --git a/internal/ceres/linear_least_squares_problems.h b/internal/ceres/linear_least_squares_problems.h
index 0e14517..c56a338 100644
--- a/internal/ceres/linear_least_squares_problems.h
+++ b/internal/ceres/linear_least_squares_problems.h
@@ -75,6 +75,8 @@
std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem4();
CERES_NO_EXPORT
std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem5();
+CERES_NO_EXPORT
+std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem6();
// Write the linear least squares problem to disk. The exact format
// depends on dump_format_type.
diff --git a/internal/ceres/partitioned_matrix_view.h b/internal/ceres/partitioned_matrix_view.h
index 86106da..533a0e5 100644
--- a/internal/ceres/partitioned_matrix_view.h
+++ b/internal/ceres/partitioned_matrix_view.h
@@ -70,9 +70,17 @@
// y += E'x
virtual void LeftMultiplyAndAccumulateE(const double* x, double* y) const = 0;
+ virtual void LeftMultiplyAndAccumulateE(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const = 0;
// y += F'x
virtual void LeftMultiplyAndAccumulateF(const double* x, double* y) const = 0;
+ virtual void LeftMultiplyAndAccumulateF(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const = 0;
// y += Ex
virtual void RightMultiplyAndAccumulateE(const double* x,
@@ -138,6 +146,14 @@
void LeftMultiplyAndAccumulateE(const double* x, double* y) const final;
void LeftMultiplyAndAccumulateF(const double* x, double* y) const final;
+ void LeftMultiplyAndAccumulateE(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const final;
+ void LeftMultiplyAndAccumulateF(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const final;
void RightMultiplyAndAccumulateE(const double* x, double* y) const final;
void RightMultiplyAndAccumulateF(const double* x, double* y) const final;
void RightMultiplyAndAccumulateE(const double* x,
diff --git a/internal/ceres/partitioned_matrix_view_impl.h b/internal/ceres/partitioned_matrix_view_impl.h
index fea0a7d..689f09e 100644
--- a/internal/ceres/partitioned_matrix_view_impl.h
+++ b/internal/ceres/partitioned_matrix_view_impl.h
@@ -188,6 +188,9 @@
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
LeftMultiplyAndAccumulateE(const double* x, double* y) const {
+ if (!num_col_blocks_e_) return;
+ if (!num_row_blocks_e_) return;
+
const CompressedRowBlockStructure* bs = matrix_.block_structure();
// Iterate over the first num_row_blocks_e_ row blocks, and multiply
@@ -211,7 +214,54 @@
template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+ LeftMultiplyAndAccumulateE(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const {
+ if (!num_col_blocks_e_) return;
+ if (!num_row_blocks_e_) return;
+
+ auto transpose_bs = matrix_.transpose_block_structure();
+ if (transpose_bs == nullptr || num_threads == 1) {
+ LeftMultiplyAndAccumulateE(x, y);
+ return;
+ }
+
+ // Local copies of class members in order to avoid capturing pointer to the
+ // whole object in lambda function
+ auto values = matrix_.values();
+ const int num_row_blocks_e = num_row_blocks_e_;
+ ParallelFor(
+ context,
+ 0,
+ num_col_blocks_e_,
+ num_threads,
+ [values, transpose_bs, num_row_blocks_e, x, y](int row_block_id) {
+ int row_block_pos = transpose_bs->rows[row_block_id].block.position;
+ int row_block_size = transpose_bs->rows[row_block_id].block.size;
+ auto& cells = transpose_bs->rows[row_block_id].cells;
+
+ for (auto& cell : cells) {
+ const int col_block_id = cell.block_id;
+ const int col_block_size = transpose_bs->cols[col_block_id].size;
+ const int col_block_pos = transpose_bs->cols[col_block_id].position;
+ if (col_block_id >= num_row_blocks_e) break;
+ MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
+ values + cell.position,
+ col_block_size,
+ row_block_size,
+ x + col_block_pos,
+ y + row_block_pos);
+ }
+ },
+ transpose_bs->rows.data(),
+ [](const CompressedRow& row) { return row.cumulative_nnz; });
+}
+
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
LeftMultiplyAndAccumulateF(const double* x, double* y) const {
+ if (!num_col_blocks_f_) return;
const CompressedRowBlockStructure* bs = matrix_.block_structure();
// Iterate over row blocks, and if the row block is in E, then
@@ -255,6 +305,67 @@
}
}
+template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
+void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>::
+ LeftMultiplyAndAccumulateF(const double* x,
+ double* y,
+ ContextImpl* context,
+ int num_threads) const {
+ if (!num_col_blocks_f_) return;
+ auto transpose_bs = matrix_.transpose_block_structure();
+ if (transpose_bs == nullptr || num_threads == 1) {
+ LeftMultiplyAndAccumulateF(x, y);
+ return;
+ }
+ // Local copies of class members in order to avoid capturing pointer to the
+ // whole object in lambda function
+ auto values = matrix_.values();
+ const int num_row_blocks_e = num_row_blocks_e_;
+ const int num_cols_e = num_cols_e_;
+ ParallelFor(
+ context,
+ num_col_blocks_e_,
+ num_col_blocks_e_ + num_col_blocks_f_,
+ num_threads,
+ [values, transpose_bs, num_row_blocks_e, num_cols_e, x, y](
+ int row_block_id) {
+ int row_block_pos = transpose_bs->rows[row_block_id].block.position;
+ int row_block_size = transpose_bs->rows[row_block_id].block.size;
+ auto& cells = transpose_bs->rows[row_block_id].cells;
+
+ const int num_cells = cells.size();
+ int cell_idx = 0;
+ for (; cell_idx < num_cells; ++cell_idx) {
+ auto& cell = cells[cell_idx];
+ const int col_block_id = cell.block_id;
+ const int col_block_size = transpose_bs->cols[col_block_id].size;
+ const int col_block_pos = transpose_bs->cols[col_block_id].position;
+ if (col_block_id >= num_row_blocks_e) break;
+
+ MatrixTransposeVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
+ values + cell.position,
+ col_block_size,
+ row_block_size,
+ x + col_block_pos,
+ y + row_block_pos - num_cols_e);
+ }
+ for (; cell_idx < num_cells; ++cell_idx) {
+ auto& cell = cells[cell_idx];
+ const int col_block_id = cell.block_id;
+ const int col_block_size = transpose_bs->cols[col_block_id].size;
+ const int col_block_pos = transpose_bs->cols[col_block_id].position;
+ MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+ values + cell.position,
+ col_block_size,
+ row_block_size,
+ x + col_block_pos,
+ y + row_block_pos - num_cols_e);
+ }
+ },
+ transpose_bs->rows.data(),
+ [](const CompressedRow& row) { return row.cumulative_nnz; });
+}
+
// Given a range of columns blocks of a matrix m, compute the block
// structure of the block diagonal of the matrix m(:,
// start_col_block:end_col_block)'m(:, start_col_block:end_col_block)
diff --git a/internal/ceres/partitioned_matrix_view_test.cc b/internal/ceres/partitioned_matrix_view_test.cc
index b11ea8e..4653e06 100644
--- a/internal/ceres/partitioned_matrix_view_test.cc
+++ b/internal/ceres/partitioned_matrix_view_test.cc
@@ -177,30 +177,40 @@
const int kMaxNumThreads = 8;
class PartitionedMatrixViewParallelTest : public ::testing::TestWithParam<int> {
protected:
+ static const int kNumProblems = 3;
void SetUp() final {
- std::unique_ptr<LinearLeastSquaresProblem> problem =
- CreateLinearLeastSquaresProblemFromId(2);
- CHECK(problem != nullptr);
- A_ = std::move(problem->A);
+ int problem_ids[kNumProblems] = {2, 4, 6};
+ for (int i = 0; i < kNumProblems; ++i) {
+ auto problem = CreateLinearLeastSquaresProblemFromId(problem_ids[i]);
+ CHECK(problem != nullptr);
+ SetUpMatrix(i, problem.get());
+ }
- num_cols_ = A_->num_cols();
- num_rows_ = A_->num_rows();
- num_eliminate_blocks_ = problem->num_eliminate_blocks;
- LinearSolver::Options options;
- options.elimination_groups.push_back(num_eliminate_blocks_);
- pmv_ = PartitionedMatrixViewBase::Create(
- options, *down_cast<BlockSparseMatrix*>(A_.get()));
context_.EnsureMinimumThreads(kMaxNumThreads);
}
+ void SetUpMatrix(int id, LinearLeastSquaresProblem* problem) {
+ A_[id] = std::move(problem->A);
+ auto& A = A_[id];
+ auto block_sparse = down_cast<BlockSparseMatrix*>(A.get());
+ block_sparse->AddTransposeBlockStructure();
+
+ num_cols_[id] = A->num_cols();
+ num_rows_[id] = A->num_rows();
+ num_eliminate_blocks_[id] = problem->num_eliminate_blocks;
+ LinearSolver::Options options;
+ options.elimination_groups.push_back(num_eliminate_blocks_[id]);
+ pmv_[id] = PartitionedMatrixViewBase::Create(options, *block_sparse);
+ }
+
double RandDouble() { return distribution_(prng_); }
ContextImpl context_;
- int num_rows_;
- int num_cols_;
- int num_eliminate_blocks_;
- std::unique_ptr<SparseMatrix> A_;
- std::unique_ptr<PartitionedMatrixViewBase> pmv_;
+ int num_rows_[kNumProblems];
+ int num_cols_[kNumProblems];
+ int num_eliminate_blocks_[kNumProblems];
+ std::unique_ptr<SparseMatrix> A_[kNumProblems];
+ std::unique_ptr<PartitionedMatrixViewBase> pmv_[kNumProblems];
std::mt19937 prng_;
std::uniform_real_distribution<double> distribution_ =
std::uniform_real_distribution<double>(0.0, 1.0);
@@ -208,23 +218,83 @@
TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateEParallel) {
const int kNumThreads = GetParam();
- Vector x1(pmv_->num_cols_e());
- Vector x2(pmv_->num_cols());
- x2.setZero();
+ for (int p = 0; p < kNumProblems; ++p) {
+ auto& pmv = pmv_[p];
+ auto& A = A_[p];
- for (int i = 0; i < pmv_->num_cols_e(); ++i) {
- x1(i) = x2(i) = RandDouble();
+ Vector x1(pmv->num_cols_e());
+ Vector x2(pmv->num_cols());
+ x2.setZero();
+
+ for (int i = 0; i < pmv->num_cols_e(); ++i) {
+ x1(i) = x2(i) = RandDouble();
+ }
+
+ Vector y1 = Vector::Zero(pmv->num_rows());
+ pmv->RightMultiplyAndAccumulateE(
+ x1.data(), y1.data(), &context_, kNumThreads);
+
+ Vector y2 = Vector::Zero(pmv->num_rows());
+ A->RightMultiplyAndAccumulate(x2.data(), y2.data());
+
+ for (int i = 0; i < pmv->num_rows(); ++i) {
+ EXPECT_NEAR(y1(i), y2(i), kEpsilon);
+ }
}
+}
- Vector y1 = Vector::Zero(pmv_->num_rows());
- pmv_->RightMultiplyAndAccumulateE(
- x1.data(), y1.data(), &context_, kNumThreads);
+TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateFParallel) {
+ const int kNumThreads = GetParam();
+ for (int p = 0; p < kNumProblems; ++p) {
+ auto& pmv = pmv_[p];
+ auto& A = A_[p];
+ Vector x1(pmv->num_cols_f());
+ Vector x2(pmv->num_cols());
+ x2.setZero();
- Vector y2 = Vector::Zero(pmv_->num_rows());
- A_->RightMultiplyAndAccumulate(x2.data(), y2.data());
+ for (int i = 0; i < pmv->num_cols_f(); ++i) {
+ x1(i) = x2(i + pmv->num_cols_e()) = RandDouble();
+ }
- for (int i = 0; i < pmv_->num_rows(); ++i) {
- EXPECT_NEAR(y1(i), y2(i), kEpsilon);
+ Vector y1 = Vector::Zero(pmv->num_rows());
+ pmv->RightMultiplyAndAccumulateF(
+ x1.data(), y1.data(), &context_, kNumThreads);
+
+ Vector y2 = Vector::Zero(pmv->num_rows());
+ A->RightMultiplyAndAccumulate(x2.data(), y2.data());
+
+ for (int i = 0; i < pmv->num_rows(); ++i) {
+ EXPECT_NEAR(y1(i), y2(i), kEpsilon);
+ }
+ }
+}
+
+TEST_P(PartitionedMatrixViewParallelTest, LeftMultiplyAndAccumulateParallel) {
+ const int kNumThreads = GetParam();
+ for (int p = 0; p < kNumProblems; ++p) {
+ auto& pmv = pmv_[p];
+ auto& A = A_[p];
+ Vector x = Vector::Zero(pmv->num_rows());
+ for (int i = 0; i < pmv->num_rows(); ++i) {
+ x(i) = RandDouble();
+ }
+ Vector x_pre = x;
+
+ Vector y = Vector::Zero(pmv->num_cols());
+ Vector y1 = Vector::Zero(pmv->num_cols_e());
+ Vector y2 = Vector::Zero(pmv->num_cols_f());
+
+ A->LeftMultiplyAndAccumulate(x.data(), y.data());
+ pmv->LeftMultiplyAndAccumulateE(
+ x.data(), y1.data(), &context_, kNumThreads);
+ pmv->LeftMultiplyAndAccumulateF(
+ x.data(), y2.data(), &context_, kNumThreads);
+
+ for (int i = 0; i < pmv->num_cols(); ++i) {
+ EXPECT_NEAR(y(i),
+ (i < pmv->num_cols_e()) ? y1(i) : y2(i - pmv->num_cols_e()),
+ kEpsilon);
+ }
}
}