Add multiplication benchmarks on BAL data Change-Id: I3cf122c689b6de789f2c697a3bb37dbe3b531b52

diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index cdbfd2e..e08e113 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc

@@ -33,7 +33,10 @@
 #include <vector>
 
 #include "benchmark/benchmark.h"
+#include "ceres/block_sparse_matrix.h"
 #include "ceres/bundle_adjustment_test_util.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_vector.h"
 #include "ceres/evaluator.h"
 #include "ceres/problem.h"
 #include "ceres/problem_impl.h"
@@ -43,9 +46,15 @@
 
 namespace ceres::internal {
 
+template <typename Derived, typename Base>
+std::unique_ptr<Derived> downcast_unique_ptr(std::unique_ptr<Base>& base) {
+  return std::unique_ptr<Derived>(dynamic_cast<Derived*>(base.release()));
+}
+
 // Benchmark library might invoke benchmark function multiple times.
 // In order to save time required to parse BAL data, we ensure that
 // each dataset is being loaded at most once.
+// Each type of jacobians is also cached after first creation
 struct BALData {
   explicit BALData(const std::string& path) {
     bal_problem = std::make_unique<BundleAdjustmentProblem>(path);
@@ -57,8 +66,68 @@
     program->ParameterBlocksToStateVector(parameters.data());
   }
 
+  std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
+      ContextImpl* context) {
+    auto problem = bal_problem->mutable_problem();
+    auto problem_impl = problem->mutable_impl();
+    CHECK(problem_impl != nullptr);
+
+    Evaluator::Options options;
+    options.linear_solver_type = ITERATIVE_SCHUR;
+    options.num_threads = 1;
+    options.context = context;
+    options.num_eliminate_blocks = 0;
+
+    std::string error;
+    auto program = problem_impl->mutable_program();
+    auto evaluator = Evaluator::Create(options, program, &error);
+    CHECK(evaluator != nullptr);
+
+    auto jacobian = evaluator->CreateJacobian();
+    auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
+    CHECK_NE(block_sparse.get(), nullptr);
+
+    std::mt19937 rng;
+    std::normal_distribution<double> rnorm;
+    const int nnz = block_sparse->num_nonzeros();
+    auto values = block_sparse->mutable_values();
+    for (int i = 0; i < nnz; ++i) {
+      values[i] = rnorm(rng);
+    }
+    return block_sparse;
+  }
+
+  std::unique_ptr<CompressedRowSparseMatrix> CreateCompressedRowSparseJacobian(
+      ContextImpl* context) {
+    auto block_sparse = BlockSparseJacobian(context);
+    auto crs_jacobian = std::make_unique<CompressedRowSparseMatrix>(
+        block_sparse->num_rows(),
+        block_sparse->num_cols(),
+        block_sparse->num_nonzeros());
+
+    block_sparse->ToCompressedRowSparseMatrix(crs_jacobian.get());
+    return crs_jacobian;
+  }
+
+  const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) {
+    if (!block_sparse_jacobian) {
+      block_sparse_jacobian = CreateBlockSparseJacobian(context);
+    }
+    return block_sparse_jacobian.get();
+  }
+
+  const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
+      ContextImpl* context) {
+    if (!crs_jacobian) {
+      crs_jacobian = CreateCompressedRowSparseJacobian(context);
+    }
+    return crs_jacobian.get();
+  }
+
   Vector parameters;
   std::unique_ptr<BundleAdjustmentProblem> bal_problem;
+  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
+  std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
 };
 
 static void Residuals(benchmark::State& state,
@@ -128,6 +197,87 @@
   }
 }
 
+static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
+                                               BALData* data,
+                                               ContextImpl* context) {
+  const int num_threads = state.range(0);
+
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  Vector y = Vector::Zero(jacobian->num_rows());
+  Vector x = Vector::Random(jacobian->num_cols());
+
+  for (auto _ : state) {
+    jacobian->RightMultiplyAndAccumulate(
+        x.data(), y.data(), context, num_threads);
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+
+static void JacobianLeftMultiplyAndAccumulate(benchmark::State& state,
+                                              BALData* data,
+                                              ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  Vector y = Vector::Zero(jacobian->num_cols());
+  Vector x = Vector::Random(jacobian->num_rows());
+
+  for (auto _ : state) {
+    jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
+  }
+  CHECK_GT(y.squaredNorm(), 0.);
+}
+
+#ifndef CERES_NO_CUDA
+static void JacobianRightMultiplyAndAccumulateCuda(benchmark::State& state,
+                                                   BALData* data,
+                                                   ContextImpl* context) {
+  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
+  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
+  CudaVector cuda_x(context, 0);
+  CudaVector cuda_y(context, 0);
+
+  Vector x(crs_jacobian->num_cols());
+  Vector y(crs_jacobian->num_rows());
+  x.setRandom();
+  y.setRandom();
+
+  cuda_x.CopyFromCpu(x);
+  cuda_y.CopyFromCpu(y);
+  double sum = 0;
+  for (auto _ : state) {
+    cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
+    sum += cuda_y.Norm();
+    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
+  }
+  CHECK_NE(sum, 0.0);
+}
+
+static void JacobianLeftMultiplyAndAccumulateCuda(benchmark::State& state,
+                                                  BALData* data,
+                                                  ContextImpl* context) {
+  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
+  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
+  CudaVector cuda_x(context, 0);
+  CudaVector cuda_y(context, 0);
+
+  Vector x(crs_jacobian->num_rows());
+  Vector y(crs_jacobian->num_cols());
+  x.setRandom();
+  y.setRandom();
+
+  cuda_x.CopyFromCpu(x);
+  cuda_y.CopyFromCpu(y);
+  double sum = 0;
+  for (auto _ : state) {
+    cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
+    sum += cuda_y.Norm();
+    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
+  }
+  CHECK_NE(sum, 0.0);
+}
+#endif
+
 }  // namespace ceres::internal
 
 // Older versions of benchmark library might come without ::benchmark::Shutdown
@@ -151,6 +301,10 @@
 
   ceres::internal::ContextImpl context;
   context.EnsureMinimumThreads(16);
+#ifndef CERES_NO_CUDA
+  std::string message;
+  context.InitCuda(&message);
+#endif
 
   for (int i = 1; i < argc; ++i) {
     const std::string path(argv[i]);
@@ -175,8 +329,50 @@
         ->Arg(4)
         ->Arg(8)
         ->Arg(16);
-  }
+    const std::string name_right_product =
+        "JacobianRightMultiplyAndAccumulate<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_right_product.c_str(),
+        ceres::internal::JacobianRightMultiplyAndAccumulate,
+        data,
+        &context)
+        ->Arg(1)
+        ->Arg(2)
+        ->Arg(4)
+        ->Arg(8)
+        ->Arg(16);
 
+#ifndef CERES_NO_CUDA
+    const std::string name_right_product_cuda =
+        "JacobianRightMultiplyAndAccumulateCuda<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_right_product_cuda.c_str(),
+        ceres::internal::JacobianRightMultiplyAndAccumulateCuda,
+        data,
+        &context)
+        ->Arg(1);
+#endif
+
+    const std::string name_left_product =
+        "JacobianLeftMultiplyAndAccumulate<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_left_product.c_str(),
+        ceres::internal::JacobianLeftMultiplyAndAccumulate,
+        data,
+        &context)
+        ->Arg(1);
+
+#ifndef CERES_NO_CUDA
+    const std::string name_left_product_cuda =
+        "JacobianLeftMultiplyAndAccumulateCuda<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_right_product_cuda.c_str(),
+        ceres::internal::JacobianLeftMultiplyAndAccumulateCuda,
+        data,
+        &context)
+        ->Arg(1);
+#endif
+  }
   ::benchmark::RunSpecifiedBenchmarks();
 
   using namespace ::benchmark;