internal/ceres/spmv_benchmark.cc - ceres-solver - Git at Google

 // Ceres Solver - A fast non-linear least squares minimizer
 // Copyright 2023 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // * Redistributions of source code must retain the above copyright notice,
 //   this list of conditions and the following disclaimer.
 // * Redistributions in binary form must reproduce the above copyright notice,
 //   this list of conditions and the following disclaimer in the documentation
 //   and/or other materials provided with the distribution.
 // * Neither the name of Google Inc. nor the names of its contributors may be
 //   used to endorse or promote products derived from this software without
 //   specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Authors: joydeepb@cs.utexas.edu (Joydeep Biswas)

 #include <memory>
 #include <random>
 #include <string>

 #include "Eigen/Dense"
 #include "absl/log/check.h"
 #include "benchmark/benchmark.h"
 #include "ceres/block_jacobi_preconditioner.h"
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/context_impl.h"
 #include "ceres/cuda_sparse_matrix.h"
 #include "ceres/cuda_vector.h"
 #include "ceres/fake_bundle_adjustment_jacobian.h"
 #include "ceres/internal/config.h"
 #include "ceres/internal/eigen.h"

 #ifndef CERES_NO_CUDA
 #include "cuda_runtime.h"
 #endif

 namespace ceres::internal {

 constexpr int kNumCameras = 1000;
 constexpr int kNumPoints = 10000;
 constexpr int kCameraSize = 6;
 constexpr int kPointSize = 3;
 constexpr double kVisibility = 0.1;

 constexpr int kNumRowBlocks = 100000;
 constexpr int kNumColBlocks = 10000;
 constexpr int kMinRowBlockSize = 1;
 constexpr int kMaxRowBlockSize = 5;
 constexpr int kMinColBlockSize = 1;
 constexpr int kMaxColBlockSize = 15;
 constexpr double kBlockDensity = 5.0 / kNumColBlocks;

 static void BM_BlockSparseRightMultiplyAndAccumulateBA(
     benchmark::State& state) {
   const int num_threads = static_cast<int>(state.range(0));
   std::mt19937 prng;
   auto jacobian = CreateFakeBundleAdjustmentJacobian(
       kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);

   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);

   Vector x(jacobian->num_cols());
   Vector y(jacobian->num_rows());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
     jacobian->RightMultiplyAndAccumulate(
         x.data(), y.data(), &context, num_threads);
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA)
     ->Arg(1)
     ->Arg(2)
     ->Arg(4)
     ->Arg(8)
     ->Arg(16);

 static void BM_BlockSparseRightMultiplyAndAccumulateUnstructured(
     benchmark::State& state) {
   const int num_threads = static_cast<int>(state.range(0));
   BlockSparseMatrix::RandomMatrixOptions options;
   options.num_row_blocks = kNumRowBlocks;
   options.num_col_blocks = kNumColBlocks;
   options.min_row_block_size = kMinRowBlockSize;
   options.min_col_block_size = kMinColBlockSize;
   options.max_row_block_size = kMaxRowBlockSize;
   options.max_col_block_size = kMaxColBlockSize;
   options.block_density = kBlockDensity;
   std::mt19937 prng;

   auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);

   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);

   Vector x(jacobian->num_cols());
   Vector y(jacobian->num_rows());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
     jacobian->RightMultiplyAndAccumulate(
         x.data(), y.data(), &context, num_threads);
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured)
     ->Arg(1)
     ->Arg(2)
     ->Arg(4)
     ->Arg(8)
     ->Arg(16);

 static void BM_BlockSparseLeftMultiplyAndAccumulateBA(benchmark::State& state) {
   std::mt19937 prng;
   auto jacobian = CreateFakeBundleAdjustmentJacobian(
       kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
   Vector x(jacobian->num_rows());
   Vector y(jacobian->num_cols());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
     jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateBA);

 static void BM_BlockSparseLeftMultiplyAndAccumulateUnstructured(
     benchmark::State& state) {
   BlockSparseMatrix::RandomMatrixOptions options;
   options.num_row_blocks = 100000;
   options.num_col_blocks = 10000;
   options.min_row_block_size = 1;
   options.min_col_block_size = 1;
   options.max_row_block_size = 10;
   options.max_col_block_size = 15;
   options.block_density = 5.0 / options.num_col_blocks;
   std::mt19937 prng;

   auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
   Vector x(jacobian->num_rows());
   Vector y(jacobian->num_cols());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
     jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateUnstructured);

 static void BM_CRSRightMultiplyAndAccumulateBA(benchmark::State& state) {
   const int num_threads = static_cast<int>(state.range(0));
   std::mt19937 prng;
   auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian(
       kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);

   auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);

   Vector x(jacobian->num_cols());
   Vector y(jacobian->num_rows());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
     jacobian->RightMultiplyAndAccumulate(
         x.data(), y.data(), &context, num_threads);
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA)
     ->Arg(1)
     ->Arg(2)
     ->Arg(4)
     ->Arg(8)
     ->Arg(16);

 static void BM_CRSRightMultiplyAndAccumulateUnstructured(
     benchmark::State& state) {
   const int num_threads = static_cast<int>(state.range(0));
   BlockSparseMatrix::RandomMatrixOptions options;
   options.num_row_blocks = kNumRowBlocks;
   options.num_col_blocks = kNumColBlocks;
   options.min_row_block_size = kMinRowBlockSize;
   options.min_col_block_size = kMinColBlockSize;
   options.max_row_block_size = kMaxRowBlockSize;
   options.max_col_block_size = kMaxColBlockSize;
   options.block_density = kBlockDensity;
   std::mt19937 prng;

   auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
   auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

   ContextImpl context;
   context.EnsureMinimumThreads(num_threads);

   Vector x(jacobian->num_cols());
   Vector y(jacobian->num_rows());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
     jacobian->RightMultiplyAndAccumulate(
         x.data(), y.data(), &context, num_threads);
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured)
     ->Arg(1)
     ->Arg(2)
     ->Arg(4)
     ->Arg(8)
     ->Arg(16);

 static void BM_CRSLeftMultiplyAndAccumulateBA(benchmark::State& state) {
   std::mt19937 prng;
   // Perform setup here
   auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian(
       kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
   auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

   Vector x(jacobian->num_rows());
   Vector y(jacobian->num_cols());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
     // This code gets timed
     jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_CRSLeftMultiplyAndAccumulateBA);

 static void BM_CRSLeftMultiplyAndAccumulateUnstructured(
     benchmark::State& state) {
   BlockSparseMatrix::RandomMatrixOptions options;
   options.num_row_blocks = kNumRowBlocks;
   options.num_col_blocks = kNumColBlocks;
   options.min_row_block_size = kMinRowBlockSize;
   options.min_col_block_size = kMinColBlockSize;
   options.max_row_block_size = kMaxRowBlockSize;
   options.max_col_block_size = kMaxColBlockSize;
   options.block_density = kBlockDensity;
   std::mt19937 prng;

   auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
   auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

   Vector x(jacobian->num_rows());
   Vector y(jacobian->num_cols());
   x.setRandom();
   y.setRandom();
   double sum = 0;
   for (auto _ : state) {
     // This code gets timed
     jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
     sum += y.norm();
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_CRSLeftMultiplyAndAccumulateUnstructured);

 #ifndef CERES_NO_CUDA
 static void BM_CudaRightMultiplyAndAccumulateBA(benchmark::State& state) {
   std::mt19937 prng;
   auto jacobian = CreateFakeBundleAdjustmentJacobian(
       kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
   ContextImpl context;
   std::string message;
   context.InitCuda(&message);
   auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
   CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
   CudaVector cuda_x(&context, 0);
   CudaVector cuda_y(&context, 0);

   Vector x(jacobian->num_cols());
   Vector y(jacobian->num_rows());
   x.setRandom();
   y.setRandom();

   cuda_x.CopyFromCpu(x);
   cuda_y.CopyFromCpu(y);
   double sum = 0;
   for (auto _ : state) {
     cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
     sum += cuda_y.Norm();
     CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_CudaRightMultiplyAndAccumulateBA);

 static void BM_CudaRightMultiplyAndAccumulateUnstructured(
     benchmark::State& state) {
   BlockSparseMatrix::RandomMatrixOptions options;
   options.num_row_blocks = kNumRowBlocks;
   options.num_col_blocks = kNumColBlocks;
   options.min_row_block_size = kMinRowBlockSize;
   options.min_col_block_size = kMinColBlockSize;
   options.max_row_block_size = kMaxRowBlockSize;
   options.max_col_block_size = kMaxColBlockSize;
   options.block_density = kBlockDensity;
   std::mt19937 prng;

   auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
   ContextImpl context;
   std::string message;
   context.InitCuda(&message);
   auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
   CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
   CudaVector cuda_x(&context, 0);
   CudaVector cuda_y(&context, 0);

   Vector x(jacobian->num_cols());
   Vector y(jacobian->num_rows());
   x.setRandom();
   y.setRandom();

   cuda_x.CopyFromCpu(x);
   cuda_y.CopyFromCpu(y);
   double sum = 0;
   for (auto _ : state) {
     cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
     sum += cuda_y.Norm();
     CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_CudaRightMultiplyAndAccumulateUnstructured);

 static void BM_CudaLeftMultiplyAndAccumulateBA(benchmark::State& state) {
   std::mt19937 prng;
   auto jacobian = CreateFakeBundleAdjustmentJacobian(
       kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
   ContextImpl context;
   std::string message;
   context.InitCuda(&message);
   auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
   CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
   CudaVector cuda_x(&context, 0);
   CudaVector cuda_y(&context, 0);

   Vector x(jacobian->num_rows());
   Vector y(jacobian->num_cols());
   x.setRandom();
   y.setRandom();

   cuda_x.CopyFromCpu(x);
   cuda_y.CopyFromCpu(y);
   double sum = 0;
   for (auto _ : state) {
     cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
     sum += cuda_y.Norm();
     CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_CudaLeftMultiplyAndAccumulateBA);

 static void BM_CudaLeftMultiplyAndAccumulateUnstructured(
     benchmark::State& state) {
   BlockSparseMatrix::RandomMatrixOptions options;
   options.num_row_blocks = kNumRowBlocks;
   options.num_col_blocks = kNumColBlocks;
   options.min_row_block_size = kMinRowBlockSize;
   options.min_col_block_size = kMinColBlockSize;
   options.max_row_block_size = kMaxRowBlockSize;
   options.max_col_block_size = kMaxColBlockSize;
   options.block_density = kBlockDensity;
   std::mt19937 prng;

   auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
   ContextImpl context;
   std::string message;
   context.InitCuda(&message);
   auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
   CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
   CudaVector cuda_x(&context, 0);
   CudaVector cuda_y(&context, 0);

   Vector x(jacobian->num_rows());
   Vector y(jacobian->num_cols());
   x.setRandom();
   y.setRandom();

   cuda_x.CopyFromCpu(x);
   cuda_y.CopyFromCpu(y);
   double sum = 0;
   for (auto _ : state) {
     cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
     sum += cuda_y.Norm();
     CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
   }
   CHECK_NE(sum, 0.0);
 }

 BENCHMARK(BM_CudaLeftMultiplyAndAccumulateUnstructured);

 #endif

 }  // namespace ceres::internal

 BENCHMARK_MAIN();
	// Ceres Solver - A fast non-linear least squares minimizer
	// Copyright 2023 Google Inc. All rights reserved.
	// http://ceres-solver.org/
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright notice,
	// this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above copyright notice,
	// this list of conditions and the following disclaimer in the documentation
	// and/or other materials provided with the distribution.
	// * Neither the name of Google Inc. nor the names of its contributors may be
	// used to endorse or promote products derived from this software without
	// specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.
	//
	// Authors: joydeepb@cs.utexas.edu (Joydeep Biswas)

	#include <memory>
	#include <random>
	#include <string>

	#include "Eigen/Dense"
	#include "absl/log/check.h"
	#include "benchmark/benchmark.h"
	#include "ceres/block_jacobi_preconditioner.h"
	#include "ceres/block_sparse_matrix.h"
	#include "ceres/context_impl.h"
	#include "ceres/cuda_sparse_matrix.h"
	#include "ceres/cuda_vector.h"
	#include "ceres/fake_bundle_adjustment_jacobian.h"
	#include "ceres/internal/config.h"
	#include "ceres/internal/eigen.h"

	#ifndef CERES_NO_CUDA
	#include "cuda_runtime.h"
	#endif

	namespace ceres::internal {

	constexpr int kNumCameras = 1000;
	constexpr int kNumPoints = 10000;
	constexpr int kCameraSize = 6;
	constexpr int kPointSize = 3;
	constexpr double kVisibility = 0.1;

	constexpr int kNumRowBlocks = 100000;
	constexpr int kNumColBlocks = 10000;
	constexpr int kMinRowBlockSize = 1;
	constexpr int kMaxRowBlockSize = 5;
	constexpr int kMinColBlockSize = 1;
	constexpr int kMaxColBlockSize = 15;
	constexpr double kBlockDensity = 5.0 / kNumColBlocks;

	static void BM_BlockSparseRightMultiplyAndAccumulateBA(
	benchmark::State& state) {
	const int num_threads = static_cast<int>(state.range(0));
	std::mt19937 prng;
	auto jacobian = CreateFakeBundleAdjustmentJacobian(
	kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);

	ContextImpl context;
	context.EnsureMinimumThreads(num_threads);

	Vector x(jacobian->num_cols());
	Vector y(jacobian->num_rows());
	x.setRandom();
	y.setRandom();
	double sum = 0;
	for (auto _ : state) {
	jacobian->RightMultiplyAndAccumulate(
	x.data(), y.data(), &context, num_threads);
	sum += y.norm();
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA)
	->Arg(1)
	->Arg(2)
	->Arg(4)
	->Arg(8)
	->Arg(16);

	static void BM_BlockSparseRightMultiplyAndAccumulateUnstructured(
	benchmark::State& state) {
	const int num_threads = static_cast<int>(state.range(0));
	BlockSparseMatrix::RandomMatrixOptions options;
	options.num_row_blocks = kNumRowBlocks;
	options.num_col_blocks = kNumColBlocks;
	options.min_row_block_size = kMinRowBlockSize;
	options.min_col_block_size = kMinColBlockSize;
	options.max_row_block_size = kMaxRowBlockSize;
	options.max_col_block_size = kMaxColBlockSize;
	options.block_density = kBlockDensity;
	std::mt19937 prng;

	auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);

	ContextImpl context;
	context.EnsureMinimumThreads(num_threads);

	Vector x(jacobian->num_cols());
	Vector y(jacobian->num_rows());
	x.setRandom();
	y.setRandom();
	double sum = 0;
	for (auto _ : state) {
	jacobian->RightMultiplyAndAccumulate(
	x.data(), y.data(), &context, num_threads);
	sum += y.norm();
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured)
	->Arg(1)
	->Arg(2)
	->Arg(4)
	->Arg(8)
	->Arg(16);

	static void BM_BlockSparseLeftMultiplyAndAccumulateBA(benchmark::State& state) {
	std::mt19937 prng;
	auto jacobian = CreateFakeBundleAdjustmentJacobian(
	kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
	Vector x(jacobian->num_rows());
	Vector y(jacobian->num_cols());
	x.setRandom();
	y.setRandom();
	double sum = 0;
	for (auto _ : state) {
	jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
	sum += y.norm();
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateBA);

	static void BM_BlockSparseLeftMultiplyAndAccumulateUnstructured(
	benchmark::State& state) {
	BlockSparseMatrix::RandomMatrixOptions options;
	options.num_row_blocks = 100000;
	options.num_col_blocks = 10000;
	options.min_row_block_size = 1;
	options.min_col_block_size = 1;
	options.max_row_block_size = 10;
	options.max_col_block_size = 15;
	options.block_density = 5.0 / options.num_col_blocks;
	std::mt19937 prng;

	auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
	Vector x(jacobian->num_rows());
	Vector y(jacobian->num_cols());
	x.setRandom();
	y.setRandom();
	double sum = 0;
	for (auto _ : state) {
	jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
	sum += y.norm();
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateUnstructured);

	static void BM_CRSRightMultiplyAndAccumulateBA(benchmark::State& state) {
	const int num_threads = static_cast<int>(state.range(0));
	std::mt19937 prng;
	auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian(
	kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);

	auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

	ContextImpl context;
	context.EnsureMinimumThreads(num_threads);

	Vector x(jacobian->num_cols());
	Vector y(jacobian->num_rows());
	x.setRandom();
	y.setRandom();
	double sum = 0;
	for (auto _ : state) {
	jacobian->RightMultiplyAndAccumulate(
	x.data(), y.data(), &context, num_threads);
	sum += y.norm();
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA)
	->Arg(1)
	->Arg(2)
	->Arg(4)
	->Arg(8)
	->Arg(16);

	static void BM_CRSRightMultiplyAndAccumulateUnstructured(
	benchmark::State& state) {
	const int num_threads = static_cast<int>(state.range(0));
	BlockSparseMatrix::RandomMatrixOptions options;
	options.num_row_blocks = kNumRowBlocks;
	options.num_col_blocks = kNumColBlocks;
	options.min_row_block_size = kMinRowBlockSize;
	options.min_col_block_size = kMinColBlockSize;
	options.max_row_block_size = kMaxRowBlockSize;
	options.max_col_block_size = kMaxColBlockSize;
	options.block_density = kBlockDensity;
	std::mt19937 prng;

	auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
	auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

	ContextImpl context;
	context.EnsureMinimumThreads(num_threads);

	Vector x(jacobian->num_cols());
	Vector y(jacobian->num_rows());
	x.setRandom();
	y.setRandom();
	double sum = 0;
	for (auto _ : state) {
	jacobian->RightMultiplyAndAccumulate(
	x.data(), y.data(), &context, num_threads);
	sum += y.norm();
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured)
	->Arg(1)
	->Arg(2)
	->Arg(4)
	->Arg(8)
	->Arg(16);

	static void BM_CRSLeftMultiplyAndAccumulateBA(benchmark::State& state) {
	std::mt19937 prng;
	// Perform setup here
	auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian(
	kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
	auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

	Vector x(jacobian->num_rows());
	Vector y(jacobian->num_cols());
	x.setRandom();
	y.setRandom();
	double sum = 0;
	for (auto _ : state) {
	// This code gets timed
	jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
	sum += y.norm();
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_CRSLeftMultiplyAndAccumulateBA);

	static void BM_CRSLeftMultiplyAndAccumulateUnstructured(
	benchmark::State& state) {
	BlockSparseMatrix::RandomMatrixOptions options;
	options.num_row_blocks = kNumRowBlocks;
	options.num_col_blocks = kNumColBlocks;
	options.min_row_block_size = kMinRowBlockSize;
	options.min_col_block_size = kMinColBlockSize;
	options.max_row_block_size = kMaxRowBlockSize;
	options.max_col_block_size = kMaxColBlockSize;
	options.block_density = kBlockDensity;
	std::mt19937 prng;

	auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
	auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

	Vector x(jacobian->num_rows());
	Vector y(jacobian->num_cols());
	x.setRandom();
	y.setRandom();
	double sum = 0;
	for (auto _ : state) {
	// This code gets timed
	jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
	sum += y.norm();
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_CRSLeftMultiplyAndAccumulateUnstructured);

	#ifndef CERES_NO_CUDA
	static void BM_CudaRightMultiplyAndAccumulateBA(benchmark::State& state) {
	std::mt19937 prng;
	auto jacobian = CreateFakeBundleAdjustmentJacobian(
	kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
	ContextImpl context;
	std::string message;
	context.InitCuda(&message);
	auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
	CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
	CudaVector cuda_x(&context, 0);
	CudaVector cuda_y(&context, 0);

	Vector x(jacobian->num_cols());
	Vector y(jacobian->num_rows());
	x.setRandom();
	y.setRandom();

	cuda_x.CopyFromCpu(x);
	cuda_y.CopyFromCpu(y);
	double sum = 0;
	for (auto _ : state) {
	cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
	sum += cuda_y.Norm();
	CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_CudaRightMultiplyAndAccumulateBA);

	static void BM_CudaRightMultiplyAndAccumulateUnstructured(
	benchmark::State& state) {
	BlockSparseMatrix::RandomMatrixOptions options;
	options.num_row_blocks = kNumRowBlocks;
	options.num_col_blocks = kNumColBlocks;
	options.min_row_block_size = kMinRowBlockSize;
	options.min_col_block_size = kMinColBlockSize;
	options.max_row_block_size = kMaxRowBlockSize;
	options.max_col_block_size = kMaxColBlockSize;
	options.block_density = kBlockDensity;
	std::mt19937 prng;

	auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
	ContextImpl context;
	std::string message;
	context.InitCuda(&message);
	auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
	CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
	CudaVector cuda_x(&context, 0);
	CudaVector cuda_y(&context, 0);

	Vector x(jacobian->num_cols());
	Vector y(jacobian->num_rows());
	x.setRandom();
	y.setRandom();

	cuda_x.CopyFromCpu(x);
	cuda_y.CopyFromCpu(y);
	double sum = 0;
	for (auto _ : state) {
	cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
	sum += cuda_y.Norm();
	CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_CudaRightMultiplyAndAccumulateUnstructured);

	static void BM_CudaLeftMultiplyAndAccumulateBA(benchmark::State& state) {
	std::mt19937 prng;
	auto jacobian = CreateFakeBundleAdjustmentJacobian(
	kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
	ContextImpl context;
	std::string message;
	context.InitCuda(&message);
	auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
	CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
	CudaVector cuda_x(&context, 0);
	CudaVector cuda_y(&context, 0);

	Vector x(jacobian->num_rows());
	Vector y(jacobian->num_cols());
	x.setRandom();
	y.setRandom();

	cuda_x.CopyFromCpu(x);
	cuda_y.CopyFromCpu(y);
	double sum = 0;
	for (auto _ : state) {
	cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
	sum += cuda_y.Norm();
	CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_CudaLeftMultiplyAndAccumulateBA);

	static void BM_CudaLeftMultiplyAndAccumulateUnstructured(
	benchmark::State& state) {
	BlockSparseMatrix::RandomMatrixOptions options;
	options.num_row_blocks = kNumRowBlocks;
	options.num_col_blocks = kNumColBlocks;
	options.min_row_block_size = kMinRowBlockSize;
	options.min_col_block_size = kMinColBlockSize;
	options.max_row_block_size = kMaxRowBlockSize;
	options.max_col_block_size = kMaxColBlockSize;
	options.block_density = kBlockDensity;
	std::mt19937 prng;

	auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
	ContextImpl context;
	std::string message;
	context.InitCuda(&message);
	auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
	CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
	CudaVector cuda_x(&context, 0);
	CudaVector cuda_y(&context, 0);

	Vector x(jacobian->num_rows());
	Vector y(jacobian->num_cols());
	x.setRandom();
	y.setRandom();

	cuda_x.CopyFromCpu(x);
	cuda_y.CopyFromCpu(y);
	double sum = 0;
	for (auto _ : state) {
	cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
	sum += cuda_y.Norm();
	CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
	}
	CHECK_NE(sum, 0.0);
	}

	BENCHMARK(BM_CudaLeftMultiplyAndAccumulateUnstructured);

	#endif

	} // namespace ceres::internal

	BENCHMARK_MAIN();