// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
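//
// Benchmarks for evaluation of residuals and jacobians, and for operations on
// the resulting sparse matrices, using bundle adjustment problems in the BAL
// format.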
#include <memory>
#include <random>
#include <string>
#include <vector>
#include "benchmark/benchmark.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/bundle_adjustment_test_util.h"
#include "ceres/cuda_block_sparse_crs_view.h"
#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/evaluator.h"
#include "ceres/implicit_schur_complement.h"
#include "ceres/partitioned_matrix_view.h"
#include "ceres/power_series_expansion_preconditioner.h"
#include "ceres/preprocessor.h"
#include "ceres/problem.h"
#include "ceres/problem_impl.h"
#include "ceres/program.h"
#include "ceres/sparse_matrix.h"
namespace ceres::internal {
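// Converts a unique_ptr<Base> into a unique_ptr<Derived> via dynamic_cast,
// transferring ownership. If the object is not of dynamic type Derived,
// nullptr is returned; callers CHECK the result immediately.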
template <typename Derived, typename Base>
std::unique_ptr<Derived> downcast_unique_ptr(std::unique_ptr<Base>& base) {
return std::unique_ptr<Derived>(dynamic_cast<Derived*>(base.release()));
}
// The benchmark library might invoke a benchmark function multiple times.
// In order to save the time required to parse BAL data, we ensure that each
// dataset is loaded at most once.
// Each type of jacobian is also cached after its first creation.
struct BALData {
using PartitionedView = PartitionedMatrixView<2, 3, 9>;
explicit BALData(const std::string& path) {
bal_problem = std::make_unique<BundleAdjustmentProblem>(path);
CHECK(bal_problem != nullptr);
auto problem_impl = bal_problem->mutable_problem()->mutable_impl();
auto preprocessor = Preprocessor::Create(MinimizerType::TRUST_REGION);
preprocessed_problem = std::make_unique<PreprocessedProblem>();
Solver::Options options = bal_problem->options();
options.linear_solver_type = ITERATIVE_SCHUR;
CHECK(preprocessor->Preprocess(
options, problem_impl, preprocessed_problem.get()));
auto program = preprocessed_problem->reduced_program.get();
parameters.resize(program->NumParameters());
program->ParameterBlocksToStateVector(parameters.data());
const int num_residuals = program->NumResiduals();
b.resize(num_residuals);
std::mt19937 rng;
std::normal_distribution<double> rnorm;
for (int i = 0; i < num_residuals; ++i) {
b[i] = rnorm(rng);
}
const int num_parameters = program->NumParameters();
D.resize(num_parameters);
for (int i = 0; i < num_parameters; ++i) {
D[i] = rnorm(rng);
}
}
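// Creates the block-sparse jacobian of the reduced program and fills it with
// random values. If sequential is true, the block structure is rewritten so
// that cell values are stored contiguously in row-block order.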
std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
ContextImpl* context, bool sequential) {
auto problem = bal_problem->mutable_problem();
auto problem_impl = problem->mutable_impl();
CHECK(problem_impl != nullptr);
Evaluator::Options options;
options.linear_solver_type = ITERATIVE_SCHUR;
options.num_threads = 1;
options.context = context;
options.num_eliminate_blocks = bal_problem->num_points();
std::string error;
auto program = preprocessed_problem->reduced_program.get();
auto evaluator = Evaluator::Create(options, program, &error);
CHECK(evaluator != nullptr);
auto jacobian = evaluator->CreateJacobian();
auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
CHECK(block_sparse != nullptr);
if (sequential) {
auto block_structure_sequential =
std::make_unique<CompressedRowBlockStructure>(
*block_sparse->block_structure());
int num_nonzeros = 0;
for (auto& row_block : block_structure_sequential->rows) {
const int row_block_size = row_block.block.size;
for (auto& cell : row_block.cells) {
const int col_block_size =
block_structure_sequential->cols[cell.block_id].size;
cell.position = num_nonzeros;
num_nonzeros += col_block_size * row_block_size;
}
}
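// When CUDA is enabled the matrix is asked to allocate its values in
// page-locked memory, which speeds up host-to-device transfers.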
block_sparse = std::make_unique<BlockSparseMatrix>(
block_structure_sequential.release(),
#ifndef CERES_NO_CUDA
true
#else
false
#endif
);
}
std::mt19937 rng;
std::normal_distribution<double> rnorm;
const int nnz = block_sparse->num_nonzeros();
auto values = block_sparse->mutable_values();
for (int i = 0; i < nnz; ++i) {
values[i] = rnorm(rng);
}
return block_sparse;
}
std::unique_ptr<CompressedRowSparseMatrix> CreateCompressedRowSparseJacobian(
ContextImpl* context) {
auto block_sparse = BlockSparseJacobian(context);
return block_sparse->ToCompressedRowSparseMatrix();
}
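// The jacobians and block-diagonal matrices below are created on first use
// and cached; the partitioned views and implicit Schur complements are
// rebuilt on every call.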
const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) {
if (!block_sparse_jacobian) {
block_sparse_jacobian = CreateBlockSparseJacobian(context, true);
}
return block_sparse_jacobian.get();
}
const BlockSparseMatrix* BlockSparseJacobianPartitioned(
ContextImpl* context) {
if (!block_sparse_jacobian_partitioned) {
block_sparse_jacobian_partitioned =
CreateBlockSparseJacobian(context, false);
}
return block_sparse_jacobian_partitioned.get();
}
const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
ContextImpl* context) {
if (!crs_jacobian) {
crs_jacobian = CreateCompressedRowSparseJacobian(context);
}
return crs_jacobian.get();
}
std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian(
const LinearSolver::Options& options) {
auto block_sparse = BlockSparseJacobianPartitioned(options.context);
return std::make_unique<PartitionedView>(options, *block_sparse);
}
BlockSparseMatrix* BlockDiagonalEtE(const LinearSolver::Options& options) {
if (!block_diagonal_ete) {
auto partitioned_view = PartitionedMatrixViewJacobian(options);
block_diagonal_ete = partitioned_view->CreateBlockDiagonalEtE();
}
return block_diagonal_ete.get();
}
BlockSparseMatrix* BlockDiagonalFtF(const LinearSolver::Options& options) {
if (!block_diagonal_ftf) {
auto partitioned_view = PartitionedMatrixViewJacobian(options);
block_diagonal_ftf = partitioned_view->CreateBlockDiagonalFtF();
}
return block_diagonal_ftf.get();
}
const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal(
const LinearSolver::Options& options) {
auto block_sparse = BlockSparseJacobianPartitioned(options.context);
implicit_schur_complement =
std::make_unique<ImplicitSchurComplement>(options);
implicit_schur_complement->Init(*block_sparse, nullptr, b.data());
return implicit_schur_complement.get();
}
const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal(
const LinearSolver::Options& options) {
auto block_sparse = BlockSparseJacobianPartitioned(options.context);
implicit_schur_complement_diag =
std::make_unique<ImplicitSchurComplement>(options);
implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data());
return implicit_schur_complement_diag.get();
}
Vector parameters;
Vector D;
Vector b;
std::unique_ptr<BundleAdjustmentProblem> bal_problem;
std::unique_ptr<PreprocessedProblem> preprocessed_problem;
std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_partitioned;
std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;
std::unique_ptr<BlockSparseMatrix> block_diagonal_ftf;
std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement;
std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement_diag;
};
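// Benchmarks residual evaluation for a varying number of threads (the
// benchmark argument).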
static void Residuals(benchmark::State& state,
BALData* data,
ContextImpl* context) {
const int num_threads = static_cast<int>(state.range(0));
Evaluator::Options options;
options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
options.num_threads = num_threads;
options.context = context;
options.num_eliminate_blocks = 0;
std::string error;
CHECK(data->preprocessed_problem != nullptr);
auto program = data->preprocessed_problem->reduced_program.get();
CHECK(program != nullptr);
auto evaluator = Evaluator::Create(options, program, &error);
CHECK(evaluator != nullptr);
double cost = 0.;
Vector residuals = Vector::Zero(program->NumResiduals());
Evaluator::EvaluateOptions eval_options;
for (auto _ : state) {
CHECK(evaluator->Evaluate(eval_options,
data->parameters.data(),
&cost,
residuals.data(),
nullptr,
nullptr));
}
}
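// Benchmarks joint evaluation of residuals and the block-sparse jacobian.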
static void ResidualsAndJacobian(benchmark::State& state,
BALData* data,
ContextImpl* context) {
const int num_threads = static_cast<int>(state.range(0));
Evaluator::Options options;
options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
options.num_threads = num_threads;
options.context = context;
options.num_eliminate_blocks = 0;
std::string error;
CHECK(data->preprocessed_problem != nullptr);
auto program = data->preprocessed_problem->reduced_program.get();
CHECK(program != nullptr);
auto evaluator = Evaluator::Create(options, program, &error);
CHECK(evaluator != nullptr);
double cost = 0.;
Vector residuals = Vector::Zero(program->NumResiduals());
auto jacobian = evaluator->CreateJacobian();
Evaluator::EvaluateOptions eval_options;
for (auto _ : state) {
CHECK(evaluator->Evaluate(eval_options,
data->parameters.data(),
&cost,
residuals.data(),
nullptr,
jacobian.get()));
}
}
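// Benchmarks Evaluator::Plus, which applies the step delta to the parameter
// vector across all parameter blocks.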
static void Plus(benchmark::State& state, BALData* data, ContextImpl* context) {
const int num_threads = static_cast<int>(state.range(0));
Evaluator::Options options;
options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
options.num_threads = num_threads;
options.context = context;
options.num_eliminate_blocks = 0;
std::string error;
CHECK(data->preprocessed_problem != nullptr);
auto program = data->preprocessed_problem->reduced_program.get();
CHECK(program != nullptr);
auto evaluator = Evaluator::Create(options, program, &error);
CHECK(evaluator != nullptr);
Vector state_plus_delta = Vector::Zero(program->NumParameters());
Vector delta = Vector::Random(program->NumEffectiveParameters());
for (auto _ : state) {
CHECK(evaluator->Plus(
data->parameters.data(), delta.data(), state_plus_delta.data()));
}
CHECK_GT(state_plus_delta.squaredNorm(), 0.);
}
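// Benchmarks application of the power series expansion preconditioner
// constructed from the implicit Schur complement with diagonal regularization.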
static void PSEPreconditioner(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.num_threads = static_cast<int>(state.range(0));
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
Preconditioner::Options preconditioner_options(options);
PowerSeriesExpansionPreconditioner preconditioner(
jacobian, 10, 0, preconditioner_options);
Vector y = Vector::Zero(jacobian->num_cols());
Vector x = Vector::Random(jacobian->num_cols());
for (auto _ : state) {
preconditioner.RightMultiplyAndAccumulate(x.data(), y.data());
}
CHECK_GT(y.squaredNorm(), 0.);
}
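// The PMV* benchmarks measure products with the E (eliminated, point) and F
// (remaining, camera) sub-matrices of the partitioned jacobian view.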
static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.num_threads = static_cast<int>(state.range(0));
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
auto jacobian = data->PartitionedMatrixViewJacobian(options);
Vector y = Vector::Zero(jacobian->num_rows());
Vector x = Vector::Random(jacobian->num_cols_f());
for (auto _ : state) {
jacobian->RightMultiplyAndAccumulateF(x.data(), y.data());
}
CHECK_GT(y.squaredNorm(), 0.);
}
static void PMVLeftMultiplyAndAccumulateF(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.num_threads = static_cast<int>(state.range(0));
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
auto jacobian = data->PartitionedMatrixViewJacobian(options);
Vector y = Vector::Zero(jacobian->num_cols_f());
Vector x = Vector::Random(jacobian->num_rows());
for (auto _ : state) {
jacobian->LeftMultiplyAndAccumulateF(x.data(), y.data());
}
CHECK_GT(y.squaredNorm(), 0.);
}
static void PMVRightMultiplyAndAccumulateE(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.num_threads = static_cast<int>(state.range(0));
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
auto jacobian = data->PartitionedMatrixViewJacobian(options);
Vector y = Vector::Zero(jacobian->num_rows());
Vector x = Vector::Random(jacobian->num_cols_e());
for (auto _ : state) {
jacobian->RightMultiplyAndAccumulateE(x.data(), y.data());
}
CHECK_GT(y.squaredNorm(), 0.);
}
static void PMVLeftMultiplyAndAccumulateE(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.num_threads = static_cast<int>(state.range(0));
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
auto jacobian = data->PartitionedMatrixViewJacobian(options);
Vector y = Vector::Zero(jacobian->num_cols_e());
Vector x = Vector::Random(jacobian->num_rows());
for (auto _ : state) {
jacobian->LeftMultiplyAndAccumulateE(x.data(), y.data());
}
CHECK_GT(y.squaredNorm(), 0.);
}
static void PMVUpdateBlockDiagonalEtE(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.num_threads = static_cast<int>(state.range(0));
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
auto jacobian = data->PartitionedMatrixViewJacobian(options);
auto block_diagonal_ete = data->BlockDiagonalEtE(options);
for (auto _ : state) {
jacobian->UpdateBlockDiagonalEtE(block_diagonal_ete);
}
}
static void PMVUpdateBlockDiagonalFtF(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.num_threads = static_cast<int>(state.range(0));
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
auto jacobian = data->PartitionedMatrixViewJacobian(options);
auto block_diagonal_ftf = data->BlockDiagonalFtF(options);
for (auto _ : state) {
jacobian->UpdateBlockDiagonalFtF(block_diagonal_ftf);
}
}
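// The ISC* benchmarks measure products with the implicit Schur complement,
// without and with diagonal regularization.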
static void ISCRightMultiplyNoDiag(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.num_threads = static_cast<int>(state.range(0));
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
auto jacobian = data->ImplicitSchurComplementWithoutDiagonal(options);
Vector y = Vector::Zero(jacobian->num_rows());
Vector x = Vector::Random(jacobian->num_cols());
for (auto _ : state) {
jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
}
CHECK_GT(y.squaredNorm(), 0.);
}
static void ISCRightMultiplyDiag(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.num_threads = static_cast<int>(state.range(0));
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
Vector y = Vector::Zero(jacobian->num_rows());
Vector x = Vector::Random(jacobian->num_cols());
for (auto _ : state) {
jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
}
CHECK_GT(y.squaredNorm(), 0.);
}
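// Benchmarks CPU conversion of the block-sparse jacobian to CRS format.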
static void JacobianToCRS(benchmark::State& state,
BALData* data,
ContextImpl* context) {
auto jacobian = data->BlockSparseJacobian(context);
std::unique_ptr<CompressedRowSparseMatrix> matrix;
for (auto _ : state) {
matrix = jacobian->ToCompressedRowSparseMatrix();
}
CHECK(matrix != nullptr);
}
#ifndef CERES_NO_CUDA
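// CUDA variants of the partitioned matrix-vector products. These operate on a
// CudaPartitionedBlockSparseCRSView constructed over the same underlying
// block-sparse matrix.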
static void PMVRightMultiplyAndAccumulateFCuda(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
options.num_threads = 1;
auto jacobian = data->PartitionedMatrixViewJacobian(options);
auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
CudaPartitionedBlockSparseCRSView view(
*underlying_matrix, jacobian->num_col_blocks_e(), context);
Vector x = Vector::Random(jacobian->num_cols_f());
CudaVector cuda_x(context, x.size());
CudaVector cuda_y(context, jacobian->num_rows());
cuda_x.CopyFromCpu(x);
cuda_y.SetZero();
auto matrix = view.matrix_f();
for (auto _ : state) {
matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
}
CHECK_GT(cuda_y.Norm(), 0.);
}
static void PMVLeftMultiplyAndAccumulateFCuda(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
options.num_threads = 1;
auto jacobian = data->PartitionedMatrixViewJacobian(options);
auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
CudaPartitionedBlockSparseCRSView view(
*underlying_matrix, jacobian->num_col_blocks_e(), context);
Vector x = Vector::Random(jacobian->num_rows());
CudaVector cuda_x(context, x.size());
CudaVector cuda_y(context, jacobian->num_cols_f());
cuda_x.CopyFromCpu(x);
cuda_y.SetZero();
auto matrix = view.matrix_f();
for (auto _ : state) {
matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
}
CHECK_GT(cuda_y.Norm(), 0.);
}
static void PMVRightMultiplyAndAccumulateECuda(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
options.num_threads = 1;
auto jacobian = data->PartitionedMatrixViewJacobian(options);
auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
CudaPartitionedBlockSparseCRSView view(
*underlying_matrix, jacobian->num_col_blocks_e(), context);
Vector x = Vector::Random(jacobian->num_cols_e());
CudaVector cuda_x(context, x.size());
CudaVector cuda_y(context, jacobian->num_rows());
cuda_x.CopyFromCpu(x);
cuda_y.SetZero();
auto matrix = view.matrix_e();
for (auto _ : state) {
matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
}
CHECK_GT(cuda_y.Norm(), 0.);
}
static void PMVLeftMultiplyAndAccumulateECuda(benchmark::State& state,
BALData* data,
ContextImpl* context) {
LinearSolver::Options options;
options.elimination_groups.push_back(data->bal_problem->num_points());
options.context = context;
options.num_threads = 1;
auto jacobian = data->PartitionedMatrixViewJacobian(options);
auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
CudaPartitionedBlockSparseCRSView view(
*underlying_matrix, jacobian->num_col_blocks_e(), context);
Vector x = Vector::Random(jacobian->num_rows());
CudaVector cuda_x(context, x.size());
CudaVector cuda_y(context, jacobian->num_cols_e());
cuda_x.CopyFromCpu(x);
cuda_y.SetZero();
auto matrix = view.matrix_e();
for (auto _ : state) {
matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
}
CHECK_GT(cuda_y.Norm(), 0.);
}
// We want CudaBlockSparseCRSView to be no slower than an explicit conversion
// to CRS on the CPU.
static void JacobianToCRSView(benchmark::State& state,
BALData* data,
ContextImpl* context) {
auto jacobian = data->BlockSparseJacobian(context);
std::unique_ptr<CudaBlockSparseCRSView> matrix;
for (auto _ : state) {
matrix = std::make_unique<CudaBlockSparseCRSView>(*jacobian, context);
}
CHECK(matrix != nullptr);
}
static void JacobianToCRSMatrix(benchmark::State& state,
BALData* data,
ContextImpl* context) {
auto jacobian = data->BlockSparseJacobian(context);
std::unique_ptr<CudaSparseMatrix> matrix;
std::unique_ptr<CompressedRowSparseMatrix> matrix_cpu;
for (auto _ : state) {
matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
}
CHECK(matrix != nullptr);
}
// Updating values in CudaBlockSparseCRSView should be roughly as fast as just
// copying the values (the time spent permuting values has to be hidden by the
// PCIe transfer).
static void JacobianToCRSViewUpdate(benchmark::State& state,
BALData* data,
ContextImpl* context) {
auto jacobian = data->BlockSparseJacobian(context);
auto matrix = CudaBlockSparseCRSView(*jacobian, context);
for (auto _ : state) {
matrix.UpdateValues(*jacobian);
}
}
static void JacobianToCRSMatrixUpdate(benchmark::State& state,
BALData* data,
ContextImpl* context) {
auto jacobian = data->BlockSparseJacobian(context);
auto matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
auto matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
for (auto _ : state) {
CHECK_EQ(cudaSuccess,
cudaMemcpy(matrix->mutable_values(),
matrix_cpu->values(),
matrix->num_nonzeros() * sizeof(double),
cudaMemcpyHostToDevice));
}
}
#endif
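// Benchmarks computation of the squared column norms of the jacobian.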
static void JacobianSquaredColumnNorm(benchmark::State& state,
BALData* data,
ContextImpl* context) {
const int num_threads = static_cast<int>(state.range(0));
auto jacobian = data->BlockSparseJacobian(context);
Vector x = Vector::Zero(jacobian->num_cols());
for (auto _ : state) {
jacobian->SquaredColumnNorm(x.data(), context, num_threads);
}
CHECK_GT(x.squaredNorm(), 0.);
}
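// Benchmarks in-place scaling of the jacobian columns; the cached jacobian is
// returned const, so constness is cast away for this mutating benchmark.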
static void JacobianScaleColumns(benchmark::State& state,
BALData* data,
ContextImpl* context) {
const int num_threads = static_cast<int>(state.range(0));
auto jacobian_const = data->BlockSparseJacobian(context);
auto jacobian = const_cast<BlockSparseMatrix*>(jacobian_const);
Vector x = Vector::Ones(jacobian->num_cols());
for (auto _ : state) {
jacobian->ScaleColumns(x.data(), context, num_threads);
}
}
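// Benchmarks threaded jacobian-vector products with the full block-sparse
// jacobian on the CPU.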
static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
BALData* data,
ContextImpl* context) {
const int num_threads = static_cast<int>(state.range(0));
auto jacobian = data->BlockSparseJacobian(context);
Vector y = Vector::Zero(jacobian->num_rows());
Vector x = Vector::Random(jacobian->num_cols());
for (auto _ : state) {
jacobian->RightMultiplyAndAccumulate(
x.data(), y.data(), context, num_threads);
}
CHECK_GT(y.squaredNorm(), 0.);
}
static void JacobianLeftMultiplyAndAccumulate(benchmark::State& state,
BALData* data,
ContextImpl* context) {
const int num_threads = static_cast<int>(state.range(0));
auto jacobian = data->BlockSparseJacobian(context);
Vector y = Vector::Zero(jacobian->num_cols());
Vector x = Vector::Random(jacobian->num_rows());
for (auto _ : state) {
jacobian->LeftMultiplyAndAccumulate(
x.data(), y.data(), context, num_threads);
}
CHECK_GT(y.squaredNorm(), 0.);
}
#ifndef CERES_NO_CUDA
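// CUDA sparse matrix-vector products with the CRS jacobian.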
static void JacobianRightMultiplyAndAccumulateCuda(benchmark::State& state,
BALData* data,
ContextImpl* context) {
auto crs_jacobian = data->CompressedRowSparseJacobian(context);
CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
CudaVector cuda_x(context, 0);
CudaVector cuda_y(context, 0);
Vector x(crs_jacobian->num_cols());
Vector y(crs_jacobian->num_rows());
x.setRandom();
y.setRandom();
cuda_x.CopyFromCpu(x);
cuda_y.CopyFromCpu(y);
double sum = 0;
for (auto _ : state) {
cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
sum += cuda_y.Norm();
CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
}
CHECK_NE(sum, 0.0);
}
static void JacobianLeftMultiplyAndAccumulateCuda(benchmark::State& state,
BALData* data,
ContextImpl* context) {
auto crs_jacobian = data->CompressedRowSparseJacobian(context);
CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
CudaVector cuda_x(context, 0);
CudaVector cuda_y(context, 0);
Vector x(crs_jacobian->num_rows());
Vector y(crs_jacobian->num_cols());
x.setRandom();
y.setRandom();
cuda_x.CopyFromCpu(x);
cuda_y.CopyFromCpu(y);
double sum = 0;
for (auto _ : state) {
cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
sum += cuda_y.Norm();
CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
}
CHECK_NE(sum, 0.0);
}
#endif
} // namespace ceres::internal
// Older versions of the benchmark library might come without the
// ::benchmark::Shutdown function. We provide an empty fallback variant of
// Shutdown in order to support both older and newer versions.
namespace benchmark_shutdown_fallback {
template <typename... Args>
void Shutdown(Args... args) {}
}  // namespace benchmark_shutdown_fallback
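// Typical invocation, assuming the binary is named evaluation_benchmark:
//   evaluation_benchmark [benchmark flags] path_to_BAL_data_1.txt ...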
int main(int argc, char** argv) {
::benchmark::Initialize(&argc, argv);
std::vector<std::unique_ptr<ceres::internal::BALData>> benchmark_data;
if (argc == 1) {
LOG(FATAL) << "No input datasets specified. Usage: " << argv[0]
<< " [benchmark flags] path_to_BAL_data_1.txt ... "
"path_to_BAL_data_N.txt";
return -1;
}
ceres::internal::ContextImpl context;
context.EnsureMinimumThreads(16);
#ifndef CERES_NO_CUDA
std::string message;
context.InitCuda(&message);
#endif
for (int i = 1; i < argc; ++i) {
const std::string path(argv[i]);
const std::string name_residuals = "Residuals<" + path + ">";
benchmark_data.emplace_back(
std::make_unique<ceres::internal::BALData>(path));
auto data = benchmark_data.back().get();
::benchmark::RegisterBenchmark(
name_residuals.c_str(), ceres::internal::Residuals, data, &context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_jacobians = "ResidualsAndJacobian<" + path + ">";
::benchmark::RegisterBenchmark(name_jacobians.c_str(),
ceres::internal::ResidualsAndJacobian,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_plus = "Plus<" + path + ">";
::benchmark::RegisterBenchmark(
name_plus.c_str(), ceres::internal::Plus, data, &context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_right_product =
"JacobianRightMultiplyAndAccumulate<" + path + ">";
::benchmark::RegisterBenchmark(
name_right_product.c_str(),
ceres::internal::JacobianRightMultiplyAndAccumulate,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_right_product_partitioned_f =
"PMVRightMultiplyAndAccumulateF<" + path + ">";
::benchmark::RegisterBenchmark(
name_right_product_partitioned_f.c_str(),
ceres::internal::PMVRightMultiplyAndAccumulateF,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
#ifndef CERES_NO_CUDA
const std::string name_right_product_partitioned_f_cuda =
"PMVRightMultiplyAndAccumulateFCuda<" + path + ">";
::benchmark::RegisterBenchmark(
name_right_product_partitioned_f_cuda.c_str(),
ceres::internal::PMVRightMultiplyAndAccumulateFCuda,
data,
&context);
#endif
const std::string name_right_product_partitioned_e =
"PMVRightMultiplyAndAccumulateE<" + path + ">";
::benchmark::RegisterBenchmark(
name_right_product_partitioned_e.c_str(),
ceres::internal::PMVRightMultiplyAndAccumulateE,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
#ifndef CERES_NO_CUDA
const std::string name_right_product_partitioned_e_cuda =
"PMVRightMultiplyAndAccumulateECuda<" + path + ">";
::benchmark::RegisterBenchmark(
name_right_product_partitioned_e_cuda.c_str(),
ceres::internal::PMVRightMultiplyAndAccumulateECuda,
data,
&context);
#endif
const std::string name_update_block_diagonal_ftf =
"PMVUpdateBlockDiagonalFtF<" + path + ">";
::benchmark::RegisterBenchmark(name_update_block_diagonal_ftf.c_str(),
ceres::internal::PMVUpdateBlockDiagonalFtF,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_pse =
"PSEPreconditionerRightMultiplyAndAccumulate<" + path + ">";
::benchmark::RegisterBenchmark(
name_pse.c_str(), ceres::internal::PSEPreconditioner, data, &context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_isc_no_diag =
"ISCRightMultiplyAndAccumulate<" + path + ">";
::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
ceres::internal::ISCRightMultiplyNoDiag,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_update_block_diagonal_ete =
"PMVUpdateBlockDiagonalEtE<" + path + ">";
::benchmark::RegisterBenchmark(name_update_block_diagonal_ete.c_str(),
ceres::internal::PMVUpdateBlockDiagonalEtE,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_isc_diag =
"ISCRightMultiplyAndAccumulateDiag<" + path + ">";
::benchmark::RegisterBenchmark(name_isc_diag.c_str(),
ceres::internal::ISCRightMultiplyDiag,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
#ifndef CERES_NO_CUDA
const std::string name_right_product_cuda =
"JacobianRightMultiplyAndAccumulateCuda<" + path + ">";
::benchmark::RegisterBenchmark(
name_right_product_cuda.c_str(),
ceres::internal::JacobianRightMultiplyAndAccumulateCuda,
data,
&context)
->Arg(1);
#endif
const std::string name_left_product =
"JacobianLeftMultiplyAndAccumulate<" + path + ">";
::benchmark::RegisterBenchmark(
name_left_product.c_str(),
ceres::internal::JacobianLeftMultiplyAndAccumulate,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_left_product_partitioned_f =
"PMVLeftMultiplyAndAccumulateF<" + path + ">";
::benchmark::RegisterBenchmark(
name_left_product_partitioned_f.c_str(),
ceres::internal::PMVLeftMultiplyAndAccumulateF,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
#ifndef CERES_NO_CUDA
const std::string name_left_product_partitioned_f_cuda =
"PMVLeftMultiplyAndAccumulateFCuda<" + path + ">";
::benchmark::RegisterBenchmark(
name_left_product_partitioned_f_cuda.c_str(),
ceres::internal::PMVLeftMultiplyAndAccumulateFCuda,
data,
&context);
#endif
const std::string name_left_product_partitioned_e =
"PMVLeftMultiplyAndAccumulateE<" + path + ">";
::benchmark::RegisterBenchmark(
name_left_product_partitioned_e.c_str(),
ceres::internal::PMVLeftMultiplyAndAccumulateE,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
#ifndef CERES_NO_CUDA
const std::string name_left_product_partitioned_e_cuda =
"PMVLeftMultiplyAndAccumulateECuda<" + path + ">";
::benchmark::RegisterBenchmark(
name_left_product_partitioned_e_cuda.c_str(),
ceres::internal::PMVLeftMultiplyAndAccumulateECuda,
data,
&context);
#endif
#ifndef CERES_NO_CUDA
const std::string name_left_product_cuda =
"JacobianLeftMultiplyAndAccumulateCuda<" + path + ">";
::benchmark::RegisterBenchmark(
name_left_product_cuda.c_str(),
ceres::internal::JacobianLeftMultiplyAndAccumulateCuda,
data,
&context)
->Arg(1);
#endif
const std::string name_squared_column_norm =
"JacobianSquaredColumnNorm<" + path + ">";
::benchmark::RegisterBenchmark(name_squared_column_norm.c_str(),
ceres::internal::JacobianSquaredColumnNorm,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_scale_columns = "JacobianScaleColumns<" + path + ">";
::benchmark::RegisterBenchmark(name_scale_columns.c_str(),
ceres::internal::JacobianScaleColumns,
data,
&context)
->Arg(1)
->Arg(2)
->Arg(4)
->Arg(8)
->Arg(16);
const std::string name_to_crs = "JacobianToCRS<" + path + ">";
::benchmark::RegisterBenchmark(
name_to_crs.c_str(), ceres::internal::JacobianToCRS, data, &context);
#ifndef CERES_NO_CUDA
const std::string name_to_crs_view = "JacobianToCRSView<" + path + ">";
::benchmark::RegisterBenchmark(name_to_crs_view.c_str(),
ceres::internal::JacobianToCRSView,
data,
&context);
const std::string name_to_crs_matrix = "JacobianToCRSMatrix<" + path + ">";
::benchmark::RegisterBenchmark(name_to_crs_matrix.c_str(),
ceres::internal::JacobianToCRSMatrix,
data,
&context);
const std::string name_to_crs_view_update =
"JacobianToCRSViewUpdate<" + path + ">";
::benchmark::RegisterBenchmark(name_to_crs_view_update.c_str(),
ceres::internal::JacobianToCRSViewUpdate,
data,
&context);
const std::string name_to_crs_matrix_update =
"JacobianToCRSMatrixUpdate<" + path + ">";
::benchmark::RegisterBenchmark(name_to_crs_matrix_update.c_str(),
ceres::internal::JacobianToCRSMatrixUpdate,
data,
&context);
#endif
}
::benchmark::RunSpecifiedBenchmarks();
using namespace ::benchmark;
using namespace benchmark_shutdown_fallback;
Shutdown();
return 0;
}