Parallel left products for PartitionedMatrixView using parallel for loops with fair partitioning. Updates for evaluation benchmark, including replacing the program with a preprocessed one. Change-Id: Ia1cc3293f106cec6b7d933675cea4e7c5a6b71e4
diff --git a/internal/ceres/bundle_adjustment_test_util.h b/internal/ceres/bundle_adjustment_test_util.h index 6b45d46..696b39e 100644 --- a/internal/ceres/bundle_adjustment_test_util.h +++ b/internal/ceres/bundle_adjustment_test_util.h
@@ -84,14 +84,15 @@ Solver::Options* mutable_solver_options() { return &options_; } // clang-format off - int num_cameras() const { return num_cameras_; } - int num_points() const { return num_points_; } - int num_observations() const { return num_observations_; } - const int* point_index() const { return point_index_; } - const int* camera_index() const { return camera_index_; } - const double* observations() const { return observations_; } - double* mutable_cameras() { return parameters_; } - double* mutable_points() { return parameters_ + 9 * num_cameras_; } + int num_cameras() const { return num_cameras_; } + int num_points() const { return num_points_; } + int num_observations() const { return num_observations_; } + const int* point_index() const { return point_index_; } + const int* camera_index() const { return camera_index_; } + const double* observations() const { return observations_; } + double* mutable_cameras() { return parameters_; } + double* mutable_points() { return parameters_ + 9 * num_cameras_; } + const Solver::Options& options() const { return options_; } // clang-format on static double kResidualTolerance;
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc index 56b4f19..aba2386 100644 --- a/internal/ceres/evaluation_benchmark.cc +++ b/internal/ceres/evaluation_benchmark.cc
@@ -39,6 +39,8 @@ #include "ceres/cuda_sparse_matrix.h" #include "ceres/cuda_vector.h" #include "ceres/evaluator.h" +#include "ceres/partitioned_matrix_view.h" +#include "ceres/preprocessor.h" #include "ceres/problem.h" #include "ceres/problem_impl.h" #include "ceres/program.h" @@ -56,12 +58,22 @@ // each dataset is being loaded at most once. // Each type of jacobians is also cached after first creation struct BALData { + using PartitionedView = PartitionedMatrixView<2, 3, 9>; explicit BALData(const std::string& path) { bal_problem = std::make_unique<BundleAdjustmentProblem>(path); CHECK(bal_problem != nullptr); auto problem_impl = bal_problem->mutable_problem()->mutable_impl(); - auto program = problem_impl->mutable_program(); + auto preprocessor = Preprocessor::Create(MinimizerType::TRUST_REGION); + + preprocessed_problem = std::make_unique<PreprocessedProblem>(); + Solver::Options options = bal_problem->options(); + options.linear_solver_type = ITERATIVE_SCHUR; + CHECK(preprocessor->Preprocess( + options, problem_impl, preprocessed_problem.get())); + + auto program = preprocessed_problem->reduced_program.get(); + parameters.resize(program->NumParameters()); program->ParameterBlocksToStateVector(parameters.data()); } @@ -76,10 +88,10 @@ options.linear_solver_type = ITERATIVE_SCHUR; options.num_threads = 1; options.context = context; - options.num_eliminate_blocks = 0; + options.num_eliminate_blocks = bal_problem->num_points(); std::string error; - auto program = problem_impl->mutable_program(); + auto program = preprocessed_problem->reduced_program.get(); auto evaluator = Evaluator::Create(options, program, &error); CHECK(evaluator != nullptr); @@ -134,19 +146,39 @@ return crs_jacobian.get(); } + const PartitionedView* PartitionedMatrixViewJacobian(ContextImpl* context) { + if (!partitioned_view_jacobian) { + auto block_sparse = BlockSparseJacobian(context); + partitioned_view_jacobian = std::make_unique<PartitionedView>( + *block_sparse, 
bal_problem->num_points()); + } + return partitioned_view_jacobian.get(); + } + + const PartitionedView* PartitionedMatrixViewJacobianWithTranspose( + ContextImpl* context) { + if (!partitioned_view_jacobian_with_transpose) { + auto block_sparse_transpose = BlockSparseJacobianWithTranspose(context); + partitioned_view_jacobian_with_transpose = + std::make_unique<PartitionedView>(*block_sparse_transpose, + bal_problem->num_points()); + } + return partitioned_view_jacobian_with_transpose.get(); + } + Vector parameters; std::unique_ptr<BundleAdjustmentProblem> bal_problem; + std::unique_ptr<PreprocessedProblem> preprocessed_problem; std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian; std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_with_transpose; std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian; + std::unique_ptr<PartitionedView> partitioned_view_jacobian; + std::unique_ptr<PartitionedView> partitioned_view_jacobian_with_transpose; }; static void Residuals(benchmark::State& state, BALData* data, ContextImpl* context) { - auto problem = data->bal_problem->mutable_problem(); - auto problem_impl = problem->mutable_impl(); - CHECK(problem_impl != nullptr); const int num_threads = state.range(0); Evaluator::Options options; @@ -156,7 +188,9 @@ options.num_eliminate_blocks = 0; std::string error; - auto program = problem_impl->mutable_program(); + CHECK(data->preprocessed_problem != nullptr); + auto program = data->preprocessed_problem->reduced_program.get(); + CHECK(program != nullptr); auto evaluator = Evaluator::Create(options, program, &error); CHECK(evaluator != nullptr); @@ -177,9 +211,6 @@ static void ResidualsAndJacobian(benchmark::State& state, BALData* data, ContextImpl* context) { - auto problem = data->bal_problem->mutable_problem(); - auto problem_impl = problem->mutable_impl(); - CHECK(problem_impl != nullptr); const int num_threads = state.range(0); Evaluator::Options options; @@ -189,7 +220,9 @@ options.num_eliminate_blocks = 0; 
std::string error; - auto program = problem_impl->mutable_program(); + CHECK(data->preprocessed_problem != nullptr); + auto program = data->preprocessed_problem->reduced_program.get(); + CHECK(program != nullptr); auto evaluator = Evaluator::Create(options, program, &error); CHECK(evaluator != nullptr); @@ -208,6 +241,107 @@ } } +static void PMVRightMultiplyAndAccumulateF(benchmark::State& state, + BALData* data, + ContextImpl* context) { + const int num_threads = state.range(0); + + auto jacobian = data->PartitionedMatrixViewJacobian(context); + + Vector y = Vector::Zero(jacobian->num_rows()); + Vector x = Vector::Random(jacobian->num_cols_f()); + + for (auto _ : state) { + jacobian->RightMultiplyAndAccumulateF( + x.data(), y.data(), context, num_threads); + } + CHECK_GT(y.squaredNorm(), 0.); +} + +static void PMVLeftMultiplyAndAccumulateF(benchmark::State& state, + BALData* data, + ContextImpl* context) { + const int num_threads = state.range(0); + + auto jacobian = data->PartitionedMatrixViewJacobian(context); + + Vector y = Vector::Zero(jacobian->num_cols_f()); + Vector x = Vector::Random(jacobian->num_rows()); + + for (auto _ : state) { + jacobian->LeftMultiplyAndAccumulateF( + x.data(), y.data(), context, num_threads); + } + CHECK_GT(y.squaredNorm(), 0.); +} + +static void PMVLeftMultiplyAndAccumulateWithTransposeF(benchmark::State& state, + BALData* data, + ContextImpl* context) { + const int num_threads = state.range(0); + + auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(context); + + Vector y = Vector::Zero(jacobian->num_cols_f()); + Vector x = Vector::Random(jacobian->num_rows()); + + for (auto _ : state) { + jacobian->LeftMultiplyAndAccumulateF( + x.data(), y.data(), context, num_threads); + } + CHECK_GT(y.squaredNorm(), 0.); +} +static void PMVRightMultiplyAndAccumulateE(benchmark::State& state, + BALData* data, + ContextImpl* context) { + const int num_threads = state.range(0); + + auto jacobian = 
data->PartitionedMatrixViewJacobian(context); + + Vector y = Vector::Zero(jacobian->num_rows()); + Vector x = Vector::Random(jacobian->num_cols_e()); + + for (auto _ : state) { + jacobian->RightMultiplyAndAccumulateE( + x.data(), y.data(), context, num_threads); + } + CHECK_GT(y.squaredNorm(), 0.); +} + +static void PMVLeftMultiplyAndAccumulateE(benchmark::State& state, + BALData* data, + ContextImpl* context) { + const int num_threads = state.range(0); + + auto jacobian = data->PartitionedMatrixViewJacobian(context); + + Vector y = Vector::Zero(jacobian->num_cols_e()); + Vector x = Vector::Random(jacobian->num_rows()); + + for (auto _ : state) { + jacobian->LeftMultiplyAndAccumulateE( + x.data(), y.data(), context, num_threads); + } + CHECK_GT(y.squaredNorm(), 0.); +} + +static void PMVLeftMultiplyAndAccumulateWithTransposeE(benchmark::State& state, + BALData* data, + ContextImpl* context) { + const int num_threads = state.range(0); + + auto jacobian = data->PartitionedMatrixViewJacobianWithTranspose(context); + + Vector y = Vector::Zero(jacobian->num_cols_e()); + Vector x = Vector::Random(jacobian->num_rows()); + + for (auto _ : state) { + jacobian->LeftMultiplyAndAccumulateE( + x.data(), y.data(), context, num_threads); + } + CHECK_GT(y.squaredNorm(), 0.); +} + static void JacobianRightMultiplyAndAccumulate(benchmark::State& state, BALData* data, ContextImpl* context) { @@ -349,6 +483,7 @@ ->Arg(4) ->Arg(8) ->Arg(16); + const std::string name_jacobians = "ResidualsAndJacobian<" + path + ">"; ::benchmark::RegisterBenchmark(name_jacobians.c_str(), ceres::internal::ResidualsAndJacobian, @@ -359,6 +494,7 @@ ->Arg(4) ->Arg(8) ->Arg(16); + const std::string name_right_product = "JacobianRightMultiplyAndAccumulate<" + path + ">"; ::benchmark::RegisterBenchmark( @@ -372,6 +508,32 @@ ->Arg(8) ->Arg(16); + const std::string name_right_product_partitioned_f = + "PMVRightMultiplyAndAccumulateF<" + path + ">"; + ::benchmark::RegisterBenchmark( + 
name_right_product_partitioned_f.c_str(), + ceres::internal::PMVRightMultiplyAndAccumulateF, + data, + &context) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); + + const std::string name_right_product_partitioned_e = + "PMVRightMultiplyAndAccumulateE<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_right_product_partitioned_e.c_str(), + ceres::internal::PMVRightMultiplyAndAccumulateE, + data, + &context) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); + #ifndef CERES_NO_CUDA const std::string name_right_product_cuda = "JacobianRightMultiplyAndAccumulateCuda<" + path + ">"; @@ -403,8 +565,50 @@ ->Arg(2) ->Arg(4) ->Arg(8) - ->Arg(16) - ->Arg(28); + ->Arg(16); + + const std::string name_left_product_partitioned_f = + "PMVLeftMultiplyAndAccumulateF<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_left_product_partitioned_f.c_str(), + ceres::internal::PMVLeftMultiplyAndAccumulateF, + data, + &context) + ->Arg(1); + const std::string name_left_product_partitioned_transpose_f = + "PMVLeftMultiplyAndAccumulateWithTransposeF<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_left_product_partitioned_transpose_f.c_str(), + ceres::internal::PMVLeftMultiplyAndAccumulateWithTransposeF, + data, + &context) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); + + const std::string name_left_product_partitioned_e = + "PMVLeftMultiplyAndAccumulateE<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_left_product_partitioned_e.c_str(), + ceres::internal::PMVLeftMultiplyAndAccumulateE, + data, + &context) + ->Arg(1); + + const std::string name_left_product_partitioned_transpose_e = + "PMVLeftMultiplyAndAccumulateWithTransposeE<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_left_product_partitioned_transpose_e.c_str(), + ceres::internal::PMVLeftMultiplyAndAccumulateWithTransposeE, + data, + &context) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); #ifndef CERES_NO_CUDA const std::string name_left_product_cuda =
diff --git a/internal/ceres/implicit_schur_complement.cc b/internal/ceres/implicit_schur_complement.cc index bbd5e4b..6249c8c 100644 --- a/internal/ceres/implicit_schur_complement.cc +++ b/internal/ceres/implicit_schur_complement.cc
@@ -110,7 +110,10 @@ // y2 = E' y1 tmp_e_cols_.setZero(); - A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data()); + A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), + tmp_e_cols_.data(), + options_.context, + options_.num_threads); // y3 = -(E'E)^-1 y2 tmp_e_cols_2_.setZero(); @@ -137,7 +140,8 @@ } // y = y5 + F' y1 - A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), y); + A_->LeftMultiplyAndAccumulateF( + tmp_rows_.data(), y, options_.context, options_.num_threads); } void ImplicitSchurComplement::InversePowerSeriesOperatorRightMultiplyAccumulate( @@ -150,7 +154,10 @@ // y2 = E' y1 tmp_e_cols_.setZero(); - A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data()); + A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), + tmp_e_cols_.data(), + options_.context, + options_.num_threads); // y3 = (E'E)^-1 y2 tmp_e_cols_2_.setZero(); @@ -167,7 +174,10 @@ // y4 = F' y1 tmp_f_cols_.setZero(); - A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), tmp_f_cols_.data()); + A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), + tmp_f_cols_.data(), + options_.context, + options_.num_threads); // y += (F'F)^-1 y4 block_diagonal_FtF_inverse_->RightMultiplyAndAccumulate( @@ -217,7 +227,10 @@ // y3 = E' y2 tmp_e_cols_.setZero(); - A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), tmp_e_cols_.data()); + A_->LeftMultiplyAndAccumulateE(tmp_rows_.data(), + tmp_e_cols_.data(), + options_.context, + options_.num_threads); // y = (E'E)^-1 y3 VectorRef(y, num_cols).setZero(); @@ -241,7 +254,8 @@ void ImplicitSchurComplement::UpdateRhs() { // y1 = E'b tmp_e_cols_.setZero(); - A_->LeftMultiplyAndAccumulateE(b_, tmp_e_cols_.data()); + A_->LeftMultiplyAndAccumulateE( + b_, tmp_e_cols_.data(), options_.context, options_.num_threads); // y2 = (E'E)^-1 y1 Vector y2 = Vector::Zero(A_->num_cols_e()); @@ -258,7 +272,8 @@ // rhs = F' y3 rhs_.setZero(); - A_->LeftMultiplyAndAccumulateF(tmp_rows_.data(), rhs_.data()); + A_->LeftMultiplyAndAccumulateF( + tmp_rows_.data(), rhs_.data(), 
options_.context, options_.num_threads); } } // namespace ceres::internal
diff --git a/internal/ceres/iterative_schur_complement_solver.cc b/internal/ceres/iterative_schur_complement_solver.cc index db4ad2b..589a1d2 100644 --- a/internal/ceres/iterative_schur_complement_solver.cc +++ b/internal/ceres/iterative_schur_complement_solver.cc
@@ -68,6 +68,9 @@ EventLogger event_logger("IterativeSchurComplementSolver::Solve"); CHECK(A->block_structure() != nullptr); + if (options_.num_threads != 1) { + A->AddTransposeBlockStructure(); + } const int num_eliminate_blocks = options_.elimination_groups[0]; // Initialize a ImplicitSchurComplement object. if (schur_complement_ == nullptr) {
diff --git a/internal/ceres/linear_least_squares_problems.cc b/internal/ceres/linear_least_squares_problems.cc index 6cee2f6..4c55fbc 100644 --- a/internal/ceres/linear_least_squares_problems.cc +++ b/internal/ceres/linear_least_squares_problems.cc
@@ -63,6 +63,8 @@ return LinearLeastSquaresProblem4(); case 5: return LinearLeastSquaresProblem5(); + case 6: + return LinearLeastSquaresProblem6(); default: LOG(FATAL) << "Unknown problem id requested " << id; } @@ -798,6 +800,133 @@ return problem; } +/* + A = [1 2 0 0 0 1 1 + 1 4 0 0 0 5 6 + 3 4 0 0 0 7 8 + 5 6 0 0 0 9 0 + 0 0 9 0 0 3 1] + + b = [0 + 1 + 2 + 3 + 4] +*/ +// BlockSparseMatrix version +// +// This problem has the unique property that it has two different +// sized f-blocks, but only one of them occurs in the rows involving +// the one e-block. So performing Schur elimination on this problem +// tests the Schur Eliminator's ability to handle non-e-block rows +// correctly when their structure does not conform to the static +// structure determined by DetectStructure. +// +// Additionally, in this problem the index of the first row of the last row +// block of E is larger than the number of row blocks in E. +// +// NOTE: This problem is too small and rank deficient to be solved without +// the diagonal regularization. 
+std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem6() { + int num_rows = 5; + int num_cols = 7; + + auto problem = std::make_unique<LinearLeastSquaresProblem>(); + + problem->b = std::make_unique<double[]>(num_rows); + problem->D = std::make_unique<double[]>(num_cols); + problem->num_eliminate_blocks = 1; + + auto* bs = new CompressedRowBlockStructure; + auto values = std::make_unique<double[]>(num_rows * num_cols); + + // Column block structure + bs->cols.emplace_back(); + bs->cols.back().size = 2; + bs->cols.back().position = 0; + + bs->cols.emplace_back(); + bs->cols.back().size = 3; + bs->cols.back().position = 2; + + bs->cols.emplace_back(); + bs->cols.back().size = 2; + bs->cols.back().position = 5; + + int nnz = 0; + + // Row 1 & 2 + { + bs->rows.emplace_back(); + CompressedRow& row = bs->rows.back(); + row.block.size = 2; + row.block.position = 0; + + row.cells.emplace_back(0, nnz); + values[nnz++] = 1; + values[nnz++] = 2; + values[nnz++] = 1; + values[nnz++] = 4; + + row.cells.emplace_back(2, nnz); + values[nnz++] = 1; + values[nnz++] = 1; + values[nnz++] = 5; + values[nnz++] = 6; + } + + // Row 3 & 4 + { + bs->rows.emplace_back(); + CompressedRow& row = bs->rows.back(); + row.block.size = 2; + row.block.position = 2; + + row.cells.emplace_back(0, nnz); + values[nnz++] = 3; + values[nnz++] = 4; + values[nnz++] = 5; + values[nnz++] = 6; + + row.cells.emplace_back(2, nnz); + values[nnz++] = 7; + values[nnz++] = 8; + values[nnz++] = 9; + values[nnz++] = 0; + } + + // Row 5 + { + bs->rows.emplace_back(); + CompressedRow& row = bs->rows.back(); + row.block.size = 1; + row.block.position = 4; + + row.cells.emplace_back(1, nnz); + values[nnz++] = 9; + values[nnz++] = 0; + values[nnz++] = 0; + + row.cells.emplace_back(2, nnz); + values[nnz++] = 3; + values[nnz++] = 1; + } + + auto A = std::make_unique<BlockSparseMatrix>(bs); + memcpy(A->mutable_values(), values.get(), nnz * sizeof(*A->values())); + + for (int i = 0; i < num_cols; ++i) { + 
problem->D.get()[i] = (i + 1) * 100; + } + + for (int i = 0; i < num_rows; ++i) { + problem->b.get()[i] = i; + } + + problem->A = std::move(A); + return problem; +} + namespace { bool DumpLinearLeastSquaresProblemToConsole(const SparseMatrix* A, const double* D,
diff --git a/internal/ceres/linear_least_squares_problems.h b/internal/ceres/linear_least_squares_problems.h index 0e14517..c56a338 100644 --- a/internal/ceres/linear_least_squares_problems.h +++ b/internal/ceres/linear_least_squares_problems.h
@@ -75,6 +75,8 @@ std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem4(); CERES_NO_EXPORT std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem5(); +CERES_NO_EXPORT +std::unique_ptr<LinearLeastSquaresProblem> LinearLeastSquaresProblem6(); // Write the linear least squares problem to disk. The exact format // depends on dump_format_type.
diff --git a/internal/ceres/partitioned_matrix_view.h b/internal/ceres/partitioned_matrix_view.h index 86106da..533a0e5 100644 --- a/internal/ceres/partitioned_matrix_view.h +++ b/internal/ceres/partitioned_matrix_view.h
@@ -70,9 +70,17 @@ // y += E'x virtual void LeftMultiplyAndAccumulateE(const double* x, double* y) const = 0; + virtual void LeftMultiplyAndAccumulateE(const double* x, + double* y, + ContextImpl* context, + int num_threads) const = 0; // y += F'x virtual void LeftMultiplyAndAccumulateF(const double* x, double* y) const = 0; + virtual void LeftMultiplyAndAccumulateF(const double* x, + double* y, + ContextImpl* context, + int num_threads) const = 0; // y += Ex virtual void RightMultiplyAndAccumulateE(const double* x, @@ -138,6 +146,14 @@ void LeftMultiplyAndAccumulateE(const double* x, double* y) const final; void LeftMultiplyAndAccumulateF(const double* x, double* y) const final; + void LeftMultiplyAndAccumulateE(const double* x, + double* y, + ContextImpl* context, + int num_threads) const final; + void LeftMultiplyAndAccumulateF(const double* x, + double* y, + ContextImpl* context, + int num_threads) const final; void RightMultiplyAndAccumulateE(const double* x, double* y) const final; void RightMultiplyAndAccumulateF(const double* x, double* y) const final; void RightMultiplyAndAccumulateE(const double* x,
diff --git a/internal/ceres/partitioned_matrix_view_impl.h b/internal/ceres/partitioned_matrix_view_impl.h index fea0a7d..689f09e 100644 --- a/internal/ceres/partitioned_matrix_view_impl.h +++ b/internal/ceres/partitioned_matrix_view_impl.h
@@ -188,6 +188,9 @@ template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: LeftMultiplyAndAccumulateE(const double* x, double* y) const { + if (!num_col_blocks_e_) return; + if (!num_row_blocks_e_) return; + const CompressedRowBlockStructure* bs = matrix_.block_structure(); // Iterate over the first num_row_blocks_e_ row blocks, and multiply @@ -211,7 +214,54 @@ template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: + LeftMultiplyAndAccumulateE(const double* x, + double* y, + ContextImpl* context, + int num_threads) const { + if (!num_col_blocks_e_) return; + if (!num_row_blocks_e_) return; + + auto transpose_bs = matrix_.transpose_block_structure(); + if (transpose_bs == nullptr || num_threads == 1) { + LeftMultiplyAndAccumulateE(x, y); + return; + } + + // Local copies of class members in order to avoid capturing pointer to the + // whole object in lambda function + auto values = matrix_.values(); + const int num_row_blocks_e = num_row_blocks_e_; + ParallelFor( + context, + 0, + num_col_blocks_e_, + num_threads, + [values, transpose_bs, num_row_blocks_e, x, y](int row_block_id) { + int row_block_pos = transpose_bs->rows[row_block_id].block.position; + int row_block_size = transpose_bs->rows[row_block_id].block.size; + auto& cells = transpose_bs->rows[row_block_id].cells; + + for (auto& cell : cells) { + const int col_block_id = cell.block_id; + const int col_block_size = transpose_bs->cols[col_block_id].size; + const int col_block_pos = transpose_bs->cols[col_block_id].position; + if (col_block_id >= num_row_blocks_e) break; + MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>( + values + cell.position, + col_block_size, + row_block_size, + x + col_block_pos, + y + row_block_pos); + } + }, + transpose_bs->rows.data(), + [](const CompressedRow& row) { return row.cumulative_nnz; }); +} + +template <int 
kRowBlockSize, int kEBlockSize, int kFBlockSize> +void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: LeftMultiplyAndAccumulateF(const double* x, double* y) const { + if (!num_col_blocks_f_) return; const CompressedRowBlockStructure* bs = matrix_.block_structure(); // Iterate over row blocks, and if the row block is in E, then @@ -255,6 +305,67 @@ } } +template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> +void PartitionedMatrixView<kRowBlockSize, kEBlockSize, kFBlockSize>:: + LeftMultiplyAndAccumulateF(const double* x, + double* y, + ContextImpl* context, + int num_threads) const { + if (!num_col_blocks_f_) return; + auto transpose_bs = matrix_.transpose_block_structure(); + if (transpose_bs == nullptr || num_threads == 1) { + LeftMultiplyAndAccumulateF(x, y); + return; + } + // Local copies of class members in order to avoid capturing pointer to the + // whole object in lambda function + auto values = matrix_.values(); + const int num_row_blocks_e = num_row_blocks_e_; + const int num_cols_e = num_cols_e_; + ParallelFor( + context, + num_col_blocks_e_, + num_col_blocks_e_ + num_col_blocks_f_, + num_threads, + [values, transpose_bs, num_row_blocks_e, num_cols_e, x, y]( + int row_block_id) { + int row_block_pos = transpose_bs->rows[row_block_id].block.position; + int row_block_size = transpose_bs->rows[row_block_id].block.size; + auto& cells = transpose_bs->rows[row_block_id].cells; + + const int num_cells = cells.size(); + int cell_idx = 0; + for (; cell_idx < num_cells; ++cell_idx) { + auto& cell = cells[cell_idx]; + const int col_block_id = cell.block_id; + const int col_block_size = transpose_bs->cols[col_block_id].size; + const int col_block_pos = transpose_bs->cols[col_block_id].position; + if (col_block_id >= num_row_blocks_e) break; + + MatrixTransposeVectorMultiply<kRowBlockSize, kFBlockSize, 1>( + values + cell.position, + col_block_size, + row_block_size, + x + col_block_pos, + y + row_block_pos - num_cols_e); + } + for (; 
cell_idx < num_cells; ++cell_idx) { + auto& cell = cells[cell_idx]; + const int col_block_id = cell.block_id; + const int col_block_size = transpose_bs->cols[col_block_id].size; + const int col_block_pos = transpose_bs->cols[col_block_id].position; + MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>( + values + cell.position, + col_block_size, + row_block_size, + x + col_block_pos, + y + row_block_pos - num_cols_e); + } + }, + transpose_bs->rows.data(), + [](const CompressedRow& row) { return row.cumulative_nnz; }); +} + // Given a range of columns blocks of a matrix m, compute the block // structure of the block diagonal of the matrix m(:, // start_col_block:end_col_block)'m(:, start_col_block:end_col_block)
diff --git a/internal/ceres/partitioned_matrix_view_test.cc b/internal/ceres/partitioned_matrix_view_test.cc index b11ea8e..4653e06 100644 --- a/internal/ceres/partitioned_matrix_view_test.cc +++ b/internal/ceres/partitioned_matrix_view_test.cc
@@ -177,30 +177,40 @@ const int kMaxNumThreads = 8; class PartitionedMatrixViewParallelTest : public ::testing::TestWithParam<int> { protected: + static const int kNumProblems = 3; void SetUp() final { - std::unique_ptr<LinearLeastSquaresProblem> problem = - CreateLinearLeastSquaresProblemFromId(2); - CHECK(problem != nullptr); - A_ = std::move(problem->A); + int problem_ids[kNumProblems] = {2, 4, 6}; + for (int i = 0; i < kNumProblems; ++i) { + auto problem = CreateLinearLeastSquaresProblemFromId(problem_ids[i]); + CHECK(problem != nullptr); + SetUpMatrix(i, problem.get()); + } - num_cols_ = A_->num_cols(); - num_rows_ = A_->num_rows(); - num_eliminate_blocks_ = problem->num_eliminate_blocks; - LinearSolver::Options options; - options.elimination_groups.push_back(num_eliminate_blocks_); - pmv_ = PartitionedMatrixViewBase::Create( - options, *down_cast<BlockSparseMatrix*>(A_.get())); context_.EnsureMinimumThreads(kMaxNumThreads); } + void SetUpMatrix(int id, LinearLeastSquaresProblem* problem) { + A_[id] = std::move(problem->A); + auto& A = A_[id]; + auto block_sparse = down_cast<BlockSparseMatrix*>(A.get()); + block_sparse->AddTransposeBlockStructure(); + + num_cols_[id] = A->num_cols(); + num_rows_[id] = A->num_rows(); + num_eliminate_blocks_[id] = problem->num_eliminate_blocks; + LinearSolver::Options options; + options.elimination_groups.push_back(num_eliminate_blocks_[id]); + pmv_[id] = PartitionedMatrixViewBase::Create(options, *block_sparse); + } + double RandDouble() { return distribution_(prng_); } ContextImpl context_; - int num_rows_; - int num_cols_; - int num_eliminate_blocks_; - std::unique_ptr<SparseMatrix> A_; - std::unique_ptr<PartitionedMatrixViewBase> pmv_; + int num_rows_[kNumProblems]; + int num_cols_[kNumProblems]; + int num_eliminate_blocks_[kNumProblems]; + std::unique_ptr<SparseMatrix> A_[kNumProblems]; + std::unique_ptr<PartitionedMatrixViewBase> pmv_[kNumProblems]; std::mt19937 prng_; std::uniform_real_distribution<double> distribution_ = 
std::uniform_real_distribution<double>(0.0, 1.0); @@ -208,23 +218,83 @@ TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateEParallel) { const int kNumThreads = GetParam(); - Vector x1(pmv_->num_cols_e()); - Vector x2(pmv_->num_cols()); - x2.setZero(); + for (int p = 0; p < kNumProblems; ++p) { + auto& pmv = pmv_[p]; + auto& A = A_[p]; - for (int i = 0; i < pmv_->num_cols_e(); ++i) { - x1(i) = x2(i) = RandDouble(); + Vector x1(pmv->num_cols_e()); + Vector x2(pmv->num_cols()); + x2.setZero(); + + for (int i = 0; i < pmv->num_cols_e(); ++i) { + x1(i) = x2(i) = RandDouble(); + } + + Vector y1 = Vector::Zero(pmv->num_rows()); + pmv->RightMultiplyAndAccumulateE( + x1.data(), y1.data(), &context_, kNumThreads); + + Vector y2 = Vector::Zero(pmv->num_rows()); + A->RightMultiplyAndAccumulate(x2.data(), y2.data()); + + for (int i = 0; i < pmv->num_rows(); ++i) { + EXPECT_NEAR(y1(i), y2(i), kEpsilon); + } } +} - Vector y1 = Vector::Zero(pmv_->num_rows()); - pmv_->RightMultiplyAndAccumulateE( - x1.data(), y1.data(), &context_, kNumThreads); +TEST_P(PartitionedMatrixViewParallelTest, RightMultiplyAndAccumulateFParallel) { + const int kNumThreads = GetParam(); + for (int p = 0; p < kNumProblems; ++p) { + auto& pmv = pmv_[p]; + auto& A = A_[p]; + Vector x1(pmv->num_cols_f()); + Vector x2(pmv->num_cols()); + x2.setZero(); - Vector y2 = Vector::Zero(pmv_->num_rows()); - A_->RightMultiplyAndAccumulate(x2.data(), y2.data()); + for (int i = 0; i < pmv->num_cols_f(); ++i) { + x1(i) = x2(i + pmv->num_cols_e()) = RandDouble(); + } - for (int i = 0; i < pmv_->num_rows(); ++i) { - EXPECT_NEAR(y1(i), y2(i), kEpsilon); + Vector y1 = Vector::Zero(pmv->num_rows()); + pmv->RightMultiplyAndAccumulateF( + x1.data(), y1.data(), &context_, kNumThreads); + + Vector y2 = Vector::Zero(pmv->num_rows()); + A->RightMultiplyAndAccumulate(x2.data(), y2.data()); + + for (int i = 0; i < pmv->num_rows(); ++i) { + EXPECT_NEAR(y1(i), y2(i), kEpsilon); + } + } +} + 
+TEST_P(PartitionedMatrixViewParallelTest, LeftMultiplyAndAccumulateParallel) { + const int kNumThreads = GetParam(); + for (int p = 0; p < kNumProblems; ++p) { + auto& pmv = pmv_[p]; + auto& A = A_[p]; + Vector x = Vector::Zero(pmv->num_rows()); + for (int i = 0; i < pmv->num_rows(); ++i) { + x(i) = RandDouble(); + } + Vector x_pre = x; + + Vector y = Vector::Zero(pmv->num_cols()); + Vector y1 = Vector::Zero(pmv->num_cols_e()); + Vector y2 = Vector::Zero(pmv->num_cols_f()); + + A->LeftMultiplyAndAccumulate(x.data(), y.data()); + pmv->LeftMultiplyAndAccumulateE( + x.data(), y1.data(), &context_, kNumThreads); + pmv->LeftMultiplyAndAccumulateF( + x.data(), y2.data(), &context_, kNumThreads); + + for (int i = 0; i < pmv->num_cols(); ++i) { + EXPECT_NEAR(y(i), + (i < pmv->num_cols_e()) ? y1(i) : y2(i - pmv->num_cols_e()), + kEpsilon); + } } }