// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#include <memory>
#include <random>
#include <string>
#include <vector>

#include "benchmark/benchmark.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/bundle_adjustment_test_util.h"
#include "ceres/cuda_block_sparse_crs_view.h"
#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/evaluator.h"
#include "ceres/implicit_schur_complement.h"
#include "ceres/partitioned_matrix_view.h"
#include "ceres/power_series_expansion_preconditioner.h"
#include "ceres/preprocessor.h"
#include "ceres/problem.h"
#include "ceres/problem_impl.h"
#include "ceres/program.h"
#include "ceres/sparse_matrix.h"

namespace ceres::internal {

// Transfers ownership out of `base` and downcasts the pointer to `Derived`.
// Returns nullptr if the dynamic_cast fails; callers CHECK the result.
template <typename Derived, typename Base>
std::unique_ptr<Derived> downcast_unique_ptr(std::unique_ptr<Base>& base) {
  return std::unique_ptr<Derived>(dynamic_cast<Derived*>(base.release()));
}

// The benchmark library may invoke a benchmark function multiple times.
// To avoid the cost of repeatedly parsing BAL data, we make sure that each
// dataset is loaded at most once.
// Each type of jacobian is also cached after its first creation.
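//
// BAL problems have 2-dimensional residual blocks, 3-dimensional point
// blocks (the E sub-matrix) and 9-dimensional camera blocks (the F
// sub-matrix), hence the PartitionedMatrixView<2, 3, 9> alias below.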
struct BALData {
  using PartitionedView = PartitionedMatrixView<2, 3, 9>;
  explicit BALData(const std::string& path) {
    bal_problem = std::make_unique<BundleAdjustmentProblem>(path);
    CHECK(bal_problem != nullptr);

    auto problem_impl = bal_problem->mutable_problem()->mutable_impl();
    auto preprocessor = Preprocessor::Create(MinimizerType::TRUST_REGION);

    preprocessed_problem = std::make_unique<PreprocessedProblem>();
    Solver::Options options = bal_problem->options();
    options.linear_solver_type = ITERATIVE_SCHUR;
    CHECK(preprocessor->Preprocess(
        options, problem_impl, preprocessed_problem.get()));

    auto program = preprocessed_problem->reduced_program.get();

    parameters.resize(program->NumParameters());
    program->ParameterBlocksToStateVector(parameters.data());

    const int num_residuals = program->NumResiduals();
    b.resize(num_residuals);

    std::mt19937 rng;
    std::normal_distribution<double> rnorm;
    for (int i = 0; i < num_residuals; ++i) {
      b[i] = rnorm(rng);
    }

    const int num_parameters = program->NumParameters();
    D.resize(num_parameters);
    for (int i = 0; i < num_parameters; ++i) {
      D[i] = rnorm(rng);
    }
  }

  std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
      ContextImpl* context, bool sequential) {
    auto problem = bal_problem->mutable_problem();
    auto problem_impl = problem->mutable_impl();
    CHECK(problem_impl != nullptr);

    Evaluator::Options options;
    options.linear_solver_type = ITERATIVE_SCHUR;
    options.num_threads = 1;
    options.context = context;
    options.num_eliminate_blocks = bal_problem->num_points();

    std::string error;
    auto program = preprocessed_problem->reduced_program.get();
    auto evaluator = Evaluator::Create(options, program, &error);
    CHECK(evaluator != nullptr);

    auto jacobian = evaluator->CreateJacobian();
    auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
    CHECK(block_sparse != nullptr);

    if (sequential) {
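      // Reassign cell positions in row-major traversal order so that the
      // values of every row block are stored contiguously, one row block
      // after another.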
      auto block_structure_sequential =
          std::make_unique<CompressedRowBlockStructure>(
              *block_sparse->block_structure());
      int num_nonzeros = 0;
      for (auto& row_block : block_structure_sequential->rows) {
        const int row_block_size = row_block.block.size;
        for (auto& cell : row_block.cells) {
          const int col_block_size =
              block_structure_sequential->cols[cell.block_id].size;
          cell.position = num_nonzeros;
          num_nonzeros += col_block_size * row_block_size;
        }
      }
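      // With CUDA enabled, the values are placed in page-locked (pinned) host
      // memory, which speeds up transfers to the GPU.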
      block_sparse = std::make_unique<BlockSparseMatrix>(
          block_structure_sequential.release(),
#ifndef CERES_NO_CUDA
          true
#else
          false
#endif
      );
    }

    std::mt19937 rng;
    std::normal_distribution<double> rnorm;
    const int nnz = block_sparse->num_nonzeros();
    auto values = block_sparse->mutable_values();
    for (int i = 0; i < nnz; ++i) {
      values[i] = rnorm(rng);
    }

    return block_sparse;
  }

  std::unique_ptr<CompressedRowSparseMatrix> CreateCompressedRowSparseJacobian(
      ContextImpl* context) {
    auto block_sparse = BlockSparseJacobian(context);
    return block_sparse->ToCompressedRowSparseMatrix();
  }

  const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) {
    if (!block_sparse_jacobian) {
      block_sparse_jacobian = CreateBlockSparseJacobian(context, true);
    }
    return block_sparse_jacobian.get();
  }

  const BlockSparseMatrix* BlockSparseJacobianPartitioned(
      ContextImpl* context) {
    if (!block_sparse_jacobian_partitioned) {
      block_sparse_jacobian_partitioned =
          CreateBlockSparseJacobian(context, false);
    }
    return block_sparse_jacobian_partitioned.get();
  }

  const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
      ContextImpl* context) {
    if (!crs_jacobian) {
      crs_jacobian = CreateCompressedRowSparseJacobian(context);
    }
    return crs_jacobian.get();
  }

  std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    return std::make_unique<PartitionedView>(options, *block_sparse);
  }

  BlockSparseMatrix* BlockDiagonalEtE(const LinearSolver::Options& options) {
    if (!block_diagonal_ete) {
      auto partitioned_view = PartitionedMatrixViewJacobian(options);
      block_diagonal_ete = partitioned_view->CreateBlockDiagonalEtE();
    }
    return block_diagonal_ete.get();
  }

  BlockSparseMatrix* BlockDiagonalFtF(const LinearSolver::Options& options) {
    if (!block_diagonal_ftf) {
      auto partitioned_view = PartitionedMatrixViewJacobian(options);
      block_diagonal_ftf = partitioned_view->CreateBlockDiagonalFtF();
    }
    return block_diagonal_ftf.get();
  }

  const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal(
      const LinearSolver::Options& options) {
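    // Rebuilt on every call, unlike the cached jacobians above, so that the
    // options passed in (e.g. the number of threads) take effect.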
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    implicit_schur_complement =
        std::make_unique<ImplicitSchurComplement>(options);
    implicit_schur_complement->Init(*block_sparse, nullptr, b.data());
    return implicit_schur_complement.get();
  }

  const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    implicit_schur_complement_diag =
        std::make_unique<ImplicitSchurComplement>(options);
    implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data());
    return implicit_schur_complement_diag.get();
  }

  Vector parameters;
  Vector D;
  Vector b;
  std::unique_ptr<BundleAdjustmentProblem> bal_problem;
  std::unique_ptr<PreprocessedProblem> preprocessed_problem;
  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_partitioned;
  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
  std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
  std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;
  std::unique_ptr<BlockSparseMatrix> block_diagonal_ftf;
  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement;
  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement_diag;
};

static void Residuals(benchmark::State& state,
                      BALData* data,
                      ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  double cost = 0.;
  Vector residuals = Vector::Zero(program->NumResiduals());

  Evaluator::EvaluateOptions eval_options;
  for (auto _ : state) {
    CHECK(evaluator->Evaluate(eval_options,
                              data->parameters.data(),
                              &cost,
                              residuals.data(),
                              nullptr,
                              nullptr));
  }
}

static void ResidualsAndJacobian(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  double cost = 0.;
  Vector residuals = Vector::Zero(program->NumResiduals());
  auto jacobian = evaluator->CreateJacobian();

  Evaluator::EvaluateOptions eval_options;
  for (auto _ : state) {
    CHECK(evaluator->Evaluate(eval_options,
                              data->parameters.data(),
                              &cost,
                              residuals.data(),
                              nullptr,
                              jacobian.get()));
  }
}

static void Plus(benchmark::State& state, BALData* data, ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  Vector state_plus_delta = Vector::Zero(program->NumParameters());
  Vector delta = Vector::Random(program->NumEffectiveParameters());
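
  // Evaluator::Plus applies the plus operation of each parameter block's
  // manifold: state_plus_delta = Plus(parameters, delta).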
  for (auto _ : state) {
    CHECK(evaluator->Plus(
        data->parameters.data(), delta.data(), state_plus_delta.data()));
  }
  CHECK_GT(state_plus_delta.squaredNorm(), 0.);
}

static void PSEPreconditioner(benchmark::State& state,
                              BALData* data,
                              ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;

  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
  Preconditioner::Options preconditioner_options(options);

  // At most 10 power-series terms; a tolerance of 0 means no early
  // termination.
  PowerSeriesExpansionPreconditioner preconditioner(
      jacobian, 10, 0, preconditioner_options);

  Vector y = Vector::Zero(jacobian->num_cols());
  Vector x = Vector::Random(jacobian->num_cols());

  for (auto _ : state) {
    preconditioner.RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
                                           BALData* data,
                                           ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols_f());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulateF(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateF(benchmark::State& state,
                                          BALData* data,
                                          ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_cols_f());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulateF(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVRightMultiplyAndAccumulateE(benchmark::State& state,
                                           BALData* data,
                                           ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols_e());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulateE(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateE(benchmark::State& state,
                                          BALData* data,
                                          ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_cols_e());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulateE(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVUpdateBlockDiagonalEtE(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto block_diagonal_ete = data->BlockDiagonalEtE(options);

  for (auto _ : state) {
    jacobian->UpdateBlockDiagonalEtE(block_diagonal_ete);
  }
}

static void PMVUpdateBlockDiagonalFtF(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto block_diagonal_ftf = data->BlockDiagonalFtF(options);

  for (auto _ : state) {
    jacobian->UpdateBlockDiagonalFtF(block_diagonal_ftf);
  }
}

static void ISCRightMultiplyNoDiag(benchmark::State& state,
                                   BALData* data,
                                   ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->ImplicitSchurComplementWithoutDiagonal(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void ISCRightMultiplyDiag(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;

  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void JacobianToCRS(benchmark::State& state,
                          BALData* data,
                          ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CompressedRowSparseMatrix> matrix;
  for (auto _ : state) {
    matrix = jacobian->ToCompressedRowSparseMatrix();
  }
  CHECK(matrix != nullptr);
}

#ifndef CERES_NO_CUDA
static void PMVRightMultiplyAndAccumulateFCuda(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_cols_f());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_rows());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_f();
  for (auto _ : state) {
    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateFCuda(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_rows());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_cols_f());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_f();
  for (auto _ : state) {
    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVRightMultiplyAndAccumulateECuda(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_cols_e());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_rows());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_e();
  for (auto _ : state) {
    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateECuda(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_rows());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_cols_e());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_e();
  for (auto _ : state) {
    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

// We want CudaBlockSparseCRSView to be no slower than an explicit conversion
// to CRS on the CPU.
static void JacobianToCRSView(benchmark::State& state,
                              BALData* data,
                              ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CudaBlockSparseCRSView> matrix;
  for (auto _ : state) {
    matrix = std::make_unique<CudaBlockSparseCRSView>(*jacobian, context);
  }
  CHECK(matrix != nullptr);
}

static void JacobianToCRSMatrix(benchmark::State& state,
                                BALData* data,
                                ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CudaSparseMatrix> matrix;
  std::unique_ptr<CompressedRowSparseMatrix> matrix_cpu;
  for (auto _ : state) {
    matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
    matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
  }
  CHECK(matrix != nullptr);
}

// Updating the values of a CudaBlockSparseCRSView should be roughly as fast
// as just copying the values (the time spent permuting values has to be
// hidden by the PCIe transfer).
static void JacobianToCRSViewUpdate(benchmark::State& state,
                                    BALData* data,
                                    ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  auto matrix = CudaBlockSparseCRSView(*jacobian, context);
  for (auto _ : state) {
    matrix.UpdateValues(*jacobian);
  }
}

static void JacobianToCRSMatrixUpdate(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  auto matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
  auto matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
  for (auto _ : state) {
    CHECK_EQ(cudaSuccess,
             cudaMemcpy(matrix->mutable_values(),
                        matrix_cpu->values(),
                        matrix->num_nonzeros() * sizeof(double),
                        cudaMemcpyHostToDevice));
  }
}
#endif

static void JacobianSquaredColumnNorm(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector x = Vector::Zero(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->SquaredColumnNorm(x.data(), context, num_threads);
  }
  CHECK_GT(x.squaredNorm(), 0.);
}

static void JacobianScaleColumns(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian_const = data->BlockSparseJacobian(context);
  auto jacobian = const_cast<BlockSparseMatrix*>(jacobian_const);

  Vector x = Vector::Ones(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->ScaleColumns(x.data(), context, num_threads);
  }
}

static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(
        x.data(), y.data(), context, num_threads);
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void JacobianLeftMultiplyAndAccumulate(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector y = Vector::Zero(jacobian->num_cols());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulate(
        x.data(), y.data(), context, num_threads);
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

#ifndef CERES_NO_CUDA
static void JacobianRightMultiplyAndAccumulateCuda(benchmark::State& state,
                                                   BALData* data,
                                                   ContextImpl* context) {
  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
  CudaVector cuda_x(context, 0);
  CudaVector cuda_y(context, 0);
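  // The CUDA vectors above are created empty; CopyFromCpu below resizes them
  // to the size of the corresponding host vector.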

  Vector x(crs_jacobian->num_cols());
  Vector y(crs_jacobian->num_rows());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}

static void JacobianLeftMultiplyAndAccumulateCuda(benchmark::State& state,
                                                  BALData* data,
                                                  ContextImpl* context) {
  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
  CudaVector cuda_x(context, 0);
  CudaVector cuda_y(context, 0);

  Vector x(crs_jacobian->num_rows());
  Vector y(crs_jacobian->num_cols());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}
#endif

}  // namespace ceres::internal

// Older versions of the benchmark library may lack the ::benchmark::Shutdown
// function. We provide an empty fallback variant of Shutdown in order to
// support both older and newer versions.
namespace benchmark_shutdown_fallback {
template <typename... Args>
void Shutdown(Args... args) {}
}  // namespace benchmark_shutdown_fallback

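// Example invocation (binary and dataset names are illustrative):
//   ./evaluation_benchmark --benchmark_filter=Residuals problem-49-7776-pre.txt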
int main(int argc, char** argv) {
  ::benchmark::Initialize(&argc, argv);

  std::vector<std::unique_ptr<ceres::internal::BALData>> benchmark_data;
  if (argc == 1) {
    LOG(FATAL) << "No input datasets specified. Usage: " << argv[0]
               << " [benchmark flags] path_to_BAL_data_1.txt ... "
                  "path_to_BAL_data_N.txt";
    return -1;
  }

  ceres::internal::ContextImpl context;
  context.EnsureMinimumThreads(16);
#ifndef CERES_NO_CUDA
  std::string message;
  context.InitCuda(&message);
#endif

  for (int i = 1; i < argc; ++i) {
    const std::string path(argv[i]);
    const std::string name_residuals = "Residuals<" + path + ">";
    benchmark_data.emplace_back(
        std::make_unique<ceres::internal::BALData>(path));
    auto data = benchmark_data.back().get();
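    // Each Arg(n) below registers a variant of the benchmark; the benchmark
    // functions read the value via state.range(0) and use it as the number of
    // threads.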
    ::benchmark::RegisterBenchmark(
        name_residuals.c_str(), ceres::internal::Residuals, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_jacobians = "ResidualsAndJacobian<" + path + ">";
    ::benchmark::RegisterBenchmark(name_jacobians.c_str(),
                                   ceres::internal::ResidualsAndJacobian,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_plus = "Plus<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_plus.c_str(), ceres::internal::Plus, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_right_product =
        "JacobianRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product.c_str(),
        ceres::internal::JacobianRightMultiplyAndAccumulate,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_right_product_partitioned_f =
        "PMVRightMultiplyAndAccumulateF<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_f.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateF,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_partitioned_f_cuda =
        "PMVRightMultiplyAndAccumulateFCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_f_cuda.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateFCuda,
        data,
        &context);
#endif

    const std::string name_right_product_partitioned_e =
        "PMVRightMultiplyAndAccumulateE<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_e.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateE,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_partitioned_e_cuda =
        "PMVRightMultiplyAndAccumulateECuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_e_cuda.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateECuda,
        data,
        &context);
#endif

    const std::string name_update_block_diagonal_ftf =
        "PMVUpdateBlockDiagonalFtF<" + path + ">";
    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ftf.c_str(),
                                   ceres::internal::PMVUpdateBlockDiagonalFtF,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_pse =
        "PSEPreconditionerRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_pse.c_str(), ceres::internal::PSEPreconditioner, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_isc_no_diag =
        "ISCRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
                                   ceres::internal::ISCRightMultiplyNoDiag,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_update_block_diagonal_ete =
        "PMVUpdateBlockDiagonalEtE<" + path + ">";
    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ete.c_str(),
                                   ceres::internal::PMVUpdateBlockDiagonalEtE,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_isc_diag =
        "ISCRightMultiplyAndAccumulateDiag<" + path + ">";
    ::benchmark::RegisterBenchmark(name_isc_diag.c_str(),
                                   ceres::internal::ISCRightMultiplyDiag,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_cuda =
        "JacobianRightMultiplyAndAccumulateCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_cuda.c_str(),
        ceres::internal::JacobianRightMultiplyAndAccumulateCuda,
        data,
        &context)
        ->Arg(1);
#endif

    const std::string name_left_product =
        "JacobianLeftMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product.c_str(),
        ceres::internal::JacobianLeftMultiplyAndAccumulate,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_left_product_partitioned_f =
        "PMVLeftMultiplyAndAccumulateF<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_f.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateF,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_left_product_partitioned_f_cuda =
        "PMVLeftMultiplyAndAccumulateFCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_f_cuda.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateFCuda,
        data,
        &context);
#endif

    const std::string name_left_product_partitioned_e =
        "PMVLeftMultiplyAndAccumulateE<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_e.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateE,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_left_product_partitioned_e_cuda =
        "PMVLeftMultiplyAndAccumulateECuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_e_cuda.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateECuda,
        data,
        &context);
#endif

#ifndef CERES_NO_CUDA
    const std::string name_left_product_cuda =
        "JacobianLeftMultiplyAndAccumulateCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_cuda.c_str(),
        ceres::internal::JacobianLeftMultiplyAndAccumulateCuda,
        data,
        &context)
        ->Arg(1);
#endif

    const std::string name_squared_column_norm =
        "JacobianSquaredColumnNorm<" + path + ">";
    ::benchmark::RegisterBenchmark(name_squared_column_norm.c_str(),
                                   ceres::internal::JacobianSquaredColumnNorm,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_scale_columns =
        "JacobianScaleColumns<" + path + ">";
    ::benchmark::RegisterBenchmark(name_scale_columns.c_str(),
                                   ceres::internal::JacobianScaleColumns,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_to_crs = "JacobianToCRS<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_to_crs.c_str(), ceres::internal::JacobianToCRS, data, &context);
#ifndef CERES_NO_CUDA
    const std::string name_to_crs_view = "JacobianToCRSView<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_view.c_str(),
                                   ceres::internal::JacobianToCRSView,
                                   data,
                                   &context);
    const std::string name_to_crs_matrix = "JacobianToCRSMatrix<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_matrix.c_str(),
                                   ceres::internal::JacobianToCRSMatrix,
                                   data,
                                   &context);
    const std::string name_to_crs_view_update =
        "JacobianToCRSViewUpdate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_view_update.c_str(),
                                   ceres::internal::JacobianToCRSViewUpdate,
                                   data,
                                   &context);
    const std::string name_to_crs_matrix_update =
        "JacobianToCRSMatrixUpdate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_matrix_update.c_str(),
                                   ceres::internal::JacobianToCRSMatrixUpdate,
                                   data,
                                   &context);
#endif
  }
  ::benchmark::RunSpecifiedBenchmarks();

  using namespace ::benchmark;
  using namespace benchmark_shutdown_fallback;
  Shutdown();
  return 0;
}