Add multiplication benchmarks on BAL data Change-Id: I3cf122c689b6de789f2c697a3bb37dbe3b531b52
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc index cdbfd2e..e08e113 100644 --- a/internal/ceres/evaluation_benchmark.cc +++ b/internal/ceres/evaluation_benchmark.cc
@@ -33,7 +33,10 @@ #include <vector> #include "benchmark/benchmark.h" +#include "ceres/block_sparse_matrix.h" #include "ceres/bundle_adjustment_test_util.h" +#include "ceres/cuda_sparse_matrix.h" +#include "ceres/cuda_vector.h" #include "ceres/evaluator.h" #include "ceres/problem.h" #include "ceres/problem_impl.h" @@ -43,9 +46,15 @@ namespace ceres::internal { +template <typename Derived, typename Base> +std::unique_ptr<Derived> downcast_unique_ptr(std::unique_ptr<Base>& base) { + return std::unique_ptr<Derived>(dynamic_cast<Derived*>(base.release())); +} + // Benchmark library might invoke benchmark function multiple times. // In order to save time required to parse BAL data, we ensure that // each dataset is being loaded at most once. +// Each type of jacobians is also cached after first creation struct BALData { explicit BALData(const std::string& path) { bal_problem = std::make_unique<BundleAdjustmentProblem>(path); @@ -57,8 +66,68 @@ program->ParameterBlocksToStateVector(parameters.data()); } + std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian( + ContextImpl* context) { + auto problem = bal_problem->mutable_problem(); + auto problem_impl = problem->mutable_impl(); + CHECK(problem_impl != nullptr); + + Evaluator::Options options; + options.linear_solver_type = ITERATIVE_SCHUR; + options.num_threads = 1; + options.context = context; + options.num_eliminate_blocks = 0; + + std::string error; + auto program = problem_impl->mutable_program(); + auto evaluator = Evaluator::Create(options, program, &error); + CHECK(evaluator != nullptr); + + auto jacobian = evaluator->CreateJacobian(); + auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian); + CHECK_NE(block_sparse.get(), nullptr); + + std::mt19937 rng; + std::normal_distribution<double> rnorm; + const int nnz = block_sparse->num_nonzeros(); + auto values = block_sparse->mutable_values(); + for (int i = 0; i < nnz; ++i) { + values[i] = rnorm(rng); + } + return block_sparse; + } + + std::unique_ptr<CompressedRowSparseMatrix> CreateCompressedRowSparseJacobian( + ContextImpl* context) { + auto block_sparse = BlockSparseJacobian(context); + auto crs_jacobian = std::make_unique<CompressedRowSparseMatrix>( + block_sparse->num_rows(), + block_sparse->num_cols(), + block_sparse->num_nonzeros()); + + block_sparse->ToCompressedRowSparseMatrix(crs_jacobian.get()); + return crs_jacobian; + } + + const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) { + if (!block_sparse_jacobian) { + block_sparse_jacobian = CreateBlockSparseJacobian(context); + } + return block_sparse_jacobian.get(); + } + + const CompressedRowSparseMatrix* CompressedRowSparseJacobian( + ContextImpl* context) { + if (!crs_jacobian) { + crs_jacobian = CreateCompressedRowSparseJacobian(context); + } + return crs_jacobian.get(); + } + Vector parameters; std::unique_ptr<BundleAdjustmentProblem> bal_problem; + std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian; + std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian; }; static void Residuals(benchmark::State& state, @@ -128,6 +197,87 @@ } } +static void JacobianRightMultiplyAndAccumulate(benchmark::State& state, + BALData* data, + ContextImpl* context) { + const int num_threads = state.range(0); + + auto jacobian = data->BlockSparseJacobian(context); + + Vector y = Vector::Zero(jacobian->num_rows()); + Vector x = Vector::Random(jacobian->num_cols()); + + for (auto _ : state) { + jacobian->RightMultiplyAndAccumulate( + x.data(), y.data(), context, num_threads); + } + CHECK_GT(y.squaredNorm(), 0.); +} + +static void JacobianLeftMultiplyAndAccumulate(benchmark::State& state, + BALData* data, + ContextImpl* context) { + auto jacobian = data->BlockSparseJacobian(context); + + Vector y = Vector::Zero(jacobian->num_cols()); + Vector x = Vector::Random(jacobian->num_rows()); + + for (auto _ : state) { + jacobian->LeftMultiplyAndAccumulate(x.data(), y.data()); + } + CHECK_GT(y.squaredNorm(), 0.); +} + +#ifndef CERES_NO_CUDA +static void JacobianRightMultiplyAndAccumulateCuda(benchmark::State& state, + BALData* data, + ContextImpl* context) { + auto crs_jacobian = data->CompressedRowSparseJacobian(context); + CudaSparseMatrix cuda_jacobian(context, *crs_jacobian); + CudaVector cuda_x(context, 0); + CudaVector cuda_y(context, 0); + + Vector x(crs_jacobian->num_cols()); + Vector y(crs_jacobian->num_rows()); + x.setRandom(); + y.setRandom(); + + cuda_x.CopyFromCpu(x); + cuda_y.CopyFromCpu(y); + double sum = 0; + for (auto _ : state) { + cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y); + sum += cuda_y.Norm(); + CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); + } + CHECK_NE(sum, 0.0); +} + +static void JacobianLeftMultiplyAndAccumulateCuda(benchmark::State& state, + BALData* data, + ContextImpl* context) { + auto crs_jacobian = data->CompressedRowSparseJacobian(context); + CudaSparseMatrix cuda_jacobian(context, *crs_jacobian); + CudaVector cuda_x(context, 0); + CudaVector cuda_y(context, 0); + + Vector x(crs_jacobian->num_rows()); + Vector y(crs_jacobian->num_cols()); + x.setRandom(); + y.setRandom(); + + cuda_x.CopyFromCpu(x); + cuda_y.CopyFromCpu(y); + double sum = 0; + for (auto _ : state) { + cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y); + sum += cuda_y.Norm(); + CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); + } + CHECK_NE(sum, 0.0); +} +#endif + } // namespace ceres::internal // Older versions of benchmark library might come without ::benchmark::Shutdown @@ -151,6 +301,10 @@ ceres::internal::ContextImpl context; context.EnsureMinimumThreads(16); +#ifndef CERES_NO_CUDA + std::string message; + context.InitCuda(&message); +#endif for (int i = 1; i < argc; ++i) { const std::string path(argv[i]); @@ -175,8 +329,50 @@ ->Arg(4) ->Arg(8) ->Arg(16); - } + const std::string name_right_product = + "JacobianRightMultiplyAndAccumulate<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_right_product.c_str(), + ceres::internal::JacobianRightMultiplyAndAccumulate, + data, + &context) + ->Arg(1) + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16); +#ifndef CERES_NO_CUDA + const std::string name_right_product_cuda = + "JacobianRightMultiplyAndAccumulateCuda<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_right_product_cuda.c_str(), + ceres::internal::JacobianRightMultiplyAndAccumulateCuda, + data, + &context) + ->Arg(1); +#endif + + const std::string name_left_product = + "JacobianLeftMultiplyAndAccumulate<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_left_product.c_str(), + ceres::internal::JacobianLeftMultiplyAndAccumulate, + data, + &context) + ->Arg(1); + +#ifndef CERES_NO_CUDA + const std::string name_left_product_cuda = + "JacobianLeftMultiplyAndAccumulateCuda<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_right_product_cuda.c_str(), + ceres::internal::JacobianLeftMultiplyAndAccumulateCuda, + data, + &context) + ->Arg(1); +#endif + } ::benchmark::RunSpecifiedBenchmarks(); using namespace ::benchmark;