|  | // Ceres Solver - A fast non-linear least squares minimizer | 
|  | // Copyright 2022 Google Inc. All rights reserved. | 
|  | // http://ceres-solver.org/ | 
|  | // | 
|  | // Redistribution and use in source and binary forms, with or without | 
|  | // modification, are permitted provided that the following conditions are met: | 
|  | // | 
|  | // * Redistributions of source code must retain the above copyright notice, | 
|  | //   this list of conditions and the following disclaimer. | 
|  | // * Redistributions in binary form must reproduce the above copyright notice, | 
|  | //   this list of conditions and the following disclaimer in the documentation | 
|  | //   and/or other materials provided with the distribution. | 
|  | // * Neither the name of Google Inc. nor the names of its contributors may be | 
|  | //   used to endorse or promote products derived from this software without | 
|  | //   specific prior written permission. | 
|  | // | 
|  | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
|  | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|  | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
|  | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
|  | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
|  | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
|  | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
|  | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
|  | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
|  | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
|  | // POSSIBILITY OF SUCH DAMAGE. | 
|  | // | 
|  | // Authors: joydeepb@cs.utexas.edu (Joydeep Biswas) | 
|  |  | 
|  | #include <memory> | 
|  | #include <random> | 
|  | #include <string> | 
|  |  | 
|  | #include "Eigen/Dense" | 
|  | #include "benchmark/benchmark.h" | 
|  | #include "ceres/block_jacobi_preconditioner.h" | 
|  | #include "ceres/block_sparse_matrix.h" | 
|  | #include "ceres/context_impl.h" | 
|  | #include "ceres/cuda_sparse_matrix.h" | 
|  | #include "ceres/cuda_vector.h" | 
|  | #include "ceres/fake_bundle_adjustment_jacobian.h" | 
|  | #include "ceres/internal/config.h" | 
|  | #include "ceres/internal/eigen.h" | 
|  | #include "ceres/linear_solver.h" | 
|  |  | 
|  | #ifndef CERES_NO_CUDA | 
|  | #include "cuda_runtime.h" | 
|  | #endif | 
|  |  | 
|  | namespace ceres::internal { | 
|  |  | 
// Arguments handed to CreateFakeBundleAdjustmentJacobian by the "BA"
// benchmarks in this file.
constexpr int kNumCameras{1000};
constexpr int kNumPoints{10000};
constexpr int kCameraSize{6};
constexpr int kPointSize{3};
constexpr double kVisibility{0.1};

// Values copied into BlockSparseMatrix::RandomMatrixOptions by the
// "Unstructured" benchmarks in this file.
constexpr int kNumRowBlocks{100000};
constexpr int kNumColBlocks{10000};
constexpr int kMinRowBlockSize{1};
constexpr int kMaxRowBlockSize{5};
constexpr int kMinColBlockSize{1};
constexpr int kMaxColBlockSize{15};
// Block density is expressed relative to the number of column blocks.
constexpr double kBlockDensity{5.0 / kNumColBlocks};
|  |  | 
|  | static void BM_BlockSparseRightMultiplyAndAccumulateBA( | 
|  | benchmark::State& state) { | 
|  | const int num_threads = static_cast<int>(state.range(0)); | 
|  | std::mt19937 prng; | 
|  | auto jacobian = CreateFakeBundleAdjustmentJacobian( | 
|  | kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); | 
|  |  | 
|  | ContextImpl context; | 
|  | context.EnsureMinimumThreads(num_threads); | 
|  |  | 
|  | Vector x(jacobian->num_cols()); | 
|  | Vector y(jacobian->num_rows()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | jacobian->RightMultiplyAndAccumulate( | 
|  | x.data(), y.data(), &context, num_threads); | 
|  | sum += y.norm(); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA) | 
|  | ->Arg(1) | 
|  | ->Arg(2) | 
|  | ->Arg(4) | 
|  | ->Arg(8) | 
|  | ->Arg(16); | 
|  |  | 
|  | static void BM_BlockSparseRightMultiplyAndAccumulateUnstructured( | 
|  | benchmark::State& state) { | 
|  | const int num_threads = static_cast<int>(state.range(0)); | 
|  | BlockSparseMatrix::RandomMatrixOptions options; | 
|  | options.num_row_blocks = kNumRowBlocks; | 
|  | options.num_col_blocks = kNumColBlocks; | 
|  | options.min_row_block_size = kMinRowBlockSize; | 
|  | options.min_col_block_size = kMinColBlockSize; | 
|  | options.max_row_block_size = kMaxRowBlockSize; | 
|  | options.max_col_block_size = kMaxColBlockSize; | 
|  | options.block_density = kBlockDensity; | 
|  | std::mt19937 prng; | 
|  |  | 
|  | auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); | 
|  |  | 
|  | ContextImpl context; | 
|  | context.EnsureMinimumThreads(num_threads); | 
|  |  | 
|  | Vector x(jacobian->num_cols()); | 
|  | Vector y(jacobian->num_rows()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | jacobian->RightMultiplyAndAccumulate( | 
|  | x.data(), y.data(), &context, num_threads); | 
|  | sum += y.norm(); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured) | 
|  | ->Arg(1) | 
|  | ->Arg(2) | 
|  | ->Arg(4) | 
|  | ->Arg(8) | 
|  | ->Arg(16); | 
|  |  | 
|  | static void BM_BlockSparseLeftMultiplyAndAccumulateBA(benchmark::State& state) { | 
|  | std::mt19937 prng; | 
|  | auto jacobian = CreateFakeBundleAdjustmentJacobian( | 
|  | kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); | 
|  | Vector x(jacobian->num_rows()); | 
|  | Vector y(jacobian->num_cols()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | jacobian->LeftMultiplyAndAccumulate(x.data(), y.data()); | 
|  | sum += y.norm(); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateBA); | 
|  |  | 
|  | static void BM_BlockSparseLeftMultiplyAndAccumulateUnstructured( | 
|  | benchmark::State& state) { | 
|  | BlockSparseMatrix::RandomMatrixOptions options; | 
|  | options.num_row_blocks = 100000; | 
|  | options.num_col_blocks = 10000; | 
|  | options.min_row_block_size = 1; | 
|  | options.min_col_block_size = 1; | 
|  | options.max_row_block_size = 10; | 
|  | options.max_col_block_size = 15; | 
|  | options.block_density = 5.0 / options.num_col_blocks; | 
|  | std::mt19937 prng; | 
|  |  | 
|  | auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); | 
|  | Vector x(jacobian->num_rows()); | 
|  | Vector y(jacobian->num_cols()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | jacobian->LeftMultiplyAndAccumulate(x.data(), y.data()); | 
|  | sum += y.norm(); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateUnstructured); | 
|  |  | 
|  | static void BM_CRSRightMultiplyAndAccumulateBA(benchmark::State& state) { | 
|  | const int num_threads = static_cast<int>(state.range(0)); | 
|  | std::mt19937 prng; | 
|  | auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian( | 
|  | kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); | 
|  |  | 
|  | auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix(); | 
|  |  | 
|  | ContextImpl context; | 
|  | context.EnsureMinimumThreads(num_threads); | 
|  |  | 
|  | Vector x(jacobian->num_cols()); | 
|  | Vector y(jacobian->num_rows()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | jacobian->RightMultiplyAndAccumulate( | 
|  | x.data(), y.data(), &context, num_threads); | 
|  | sum += y.norm(); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA) | 
|  | ->Arg(1) | 
|  | ->Arg(2) | 
|  | ->Arg(4) | 
|  | ->Arg(8) | 
|  | ->Arg(16); | 
|  |  | 
|  | static void BM_CRSRightMultiplyAndAccumulateUnstructured( | 
|  | benchmark::State& state) { | 
|  | const int num_threads = static_cast<int>(state.range(0)); | 
|  | BlockSparseMatrix::RandomMatrixOptions options; | 
|  | options.num_row_blocks = kNumRowBlocks; | 
|  | options.num_col_blocks = kNumColBlocks; | 
|  | options.min_row_block_size = kMinRowBlockSize; | 
|  | options.min_col_block_size = kMinColBlockSize; | 
|  | options.max_row_block_size = kMaxRowBlockSize; | 
|  | options.max_col_block_size = kMaxColBlockSize; | 
|  | options.block_density = kBlockDensity; | 
|  | std::mt19937 prng; | 
|  |  | 
|  | auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); | 
|  | auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix(); | 
|  |  | 
|  | ContextImpl context; | 
|  | context.EnsureMinimumThreads(num_threads); | 
|  |  | 
|  | Vector x(jacobian->num_cols()); | 
|  | Vector y(jacobian->num_rows()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | jacobian->RightMultiplyAndAccumulate( | 
|  | x.data(), y.data(), &context, num_threads); | 
|  | sum += y.norm(); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured) | 
|  | ->Arg(1) | 
|  | ->Arg(2) | 
|  | ->Arg(4) | 
|  | ->Arg(8) | 
|  | ->Arg(16); | 
|  |  | 
|  | static void BM_CRSLeftMultiplyAndAccumulateBA(benchmark::State& state) { | 
|  | std::mt19937 prng; | 
|  | // Perform setup here | 
|  | auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian( | 
|  | kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); | 
|  | auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix(); | 
|  |  | 
|  | Vector x(jacobian->num_rows()); | 
|  | Vector y(jacobian->num_cols()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | // This code gets timed | 
|  | jacobian->LeftMultiplyAndAccumulate(x.data(), y.data()); | 
|  | sum += y.norm(); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_CRSLeftMultiplyAndAccumulateBA); | 
|  |  | 
|  | static void BM_CRSLeftMultiplyAndAccumulateUnstructured( | 
|  | benchmark::State& state) { | 
|  | BlockSparseMatrix::RandomMatrixOptions options; | 
|  | options.num_row_blocks = kNumRowBlocks; | 
|  | options.num_col_blocks = kNumColBlocks; | 
|  | options.min_row_block_size = kMinRowBlockSize; | 
|  | options.min_col_block_size = kMinColBlockSize; | 
|  | options.max_row_block_size = kMaxRowBlockSize; | 
|  | options.max_col_block_size = kMaxColBlockSize; | 
|  | options.block_density = kBlockDensity; | 
|  | std::mt19937 prng; | 
|  |  | 
|  | auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); | 
|  | auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix(); | 
|  |  | 
|  | Vector x(jacobian->num_rows()); | 
|  | Vector y(jacobian->num_cols()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | // This code gets timed | 
|  | jacobian->LeftMultiplyAndAccumulate(x.data(), y.data()); | 
|  | sum += y.norm(); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_CRSLeftMultiplyAndAccumulateUnstructured); | 
|  |  | 
|  | #ifndef CERES_NO_CUDA | 
|  | static void BM_CudaRightMultiplyAndAccumulateBA(benchmark::State& state) { | 
|  | std::mt19937 prng; | 
|  | auto jacobian = CreateFakeBundleAdjustmentJacobian( | 
|  | kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); | 
|  | ContextImpl context; | 
|  | std::string message; | 
|  | context.InitCuda(&message); | 
|  | auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix(); | 
|  | CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs); | 
|  | CudaVector cuda_x(&context, 0); | 
|  | CudaVector cuda_y(&context, 0); | 
|  |  | 
|  | Vector x(jacobian->num_cols()); | 
|  | Vector y(jacobian->num_rows()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  |  | 
|  | cuda_x.CopyFromCpu(x); | 
|  | cuda_y.CopyFromCpu(y); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y); | 
|  | sum += cuda_y.Norm(); | 
|  | CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_CudaRightMultiplyAndAccumulateBA); | 
|  |  | 
|  | static void BM_CudaRightMultiplyAndAccumulateUnstructured( | 
|  | benchmark::State& state) { | 
|  | BlockSparseMatrix::RandomMatrixOptions options; | 
|  | options.num_row_blocks = kNumRowBlocks; | 
|  | options.num_col_blocks = kNumColBlocks; | 
|  | options.min_row_block_size = kMinRowBlockSize; | 
|  | options.min_col_block_size = kMinColBlockSize; | 
|  | options.max_row_block_size = kMaxRowBlockSize; | 
|  | options.max_col_block_size = kMaxColBlockSize; | 
|  | options.block_density = kBlockDensity; | 
|  | std::mt19937 prng; | 
|  |  | 
|  | auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); | 
|  | ContextImpl context; | 
|  | std::string message; | 
|  | context.InitCuda(&message); | 
|  | auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix(); | 
|  | CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs); | 
|  | CudaVector cuda_x(&context, 0); | 
|  | CudaVector cuda_y(&context, 0); | 
|  |  | 
|  | Vector x(jacobian->num_cols()); | 
|  | Vector y(jacobian->num_rows()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  |  | 
|  | cuda_x.CopyFromCpu(x); | 
|  | cuda_y.CopyFromCpu(y); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y); | 
|  | sum += cuda_y.Norm(); | 
|  | CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_CudaRightMultiplyAndAccumulateUnstructured); | 
|  |  | 
|  | static void BM_CudaLeftMultiplyAndAccumulateBA(benchmark::State& state) { | 
|  | std::mt19937 prng; | 
|  | auto jacobian = CreateFakeBundleAdjustmentJacobian( | 
|  | kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); | 
|  | ContextImpl context; | 
|  | std::string message; | 
|  | context.InitCuda(&message); | 
|  | auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix(); | 
|  | CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs); | 
|  | CudaVector cuda_x(&context, 0); | 
|  | CudaVector cuda_y(&context, 0); | 
|  |  | 
|  | Vector x(jacobian->num_rows()); | 
|  | Vector y(jacobian->num_cols()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  |  | 
|  | cuda_x.CopyFromCpu(x); | 
|  | cuda_y.CopyFromCpu(y); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y); | 
|  | sum += cuda_y.Norm(); | 
|  | CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_CudaLeftMultiplyAndAccumulateBA); | 
|  |  | 
|  | static void BM_CudaLeftMultiplyAndAccumulateUnstructured( | 
|  | benchmark::State& state) { | 
|  | BlockSparseMatrix::RandomMatrixOptions options; | 
|  | options.num_row_blocks = kNumRowBlocks; | 
|  | options.num_col_blocks = kNumColBlocks; | 
|  | options.min_row_block_size = kMinRowBlockSize; | 
|  | options.min_col_block_size = kMinColBlockSize; | 
|  | options.max_row_block_size = kMaxRowBlockSize; | 
|  | options.max_col_block_size = kMaxColBlockSize; | 
|  | options.block_density = kBlockDensity; | 
|  | std::mt19937 prng; | 
|  |  | 
|  | auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); | 
|  | ContextImpl context; | 
|  | std::string message; | 
|  | context.InitCuda(&message); | 
|  | auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix(); | 
|  | CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs); | 
|  | CudaVector cuda_x(&context, 0); | 
|  | CudaVector cuda_y(&context, 0); | 
|  |  | 
|  | Vector x(jacobian->num_rows()); | 
|  | Vector y(jacobian->num_cols()); | 
|  | x.setRandom(); | 
|  | y.setRandom(); | 
|  |  | 
|  | cuda_x.CopyFromCpu(x); | 
|  | cuda_y.CopyFromCpu(y); | 
|  | double sum = 0; | 
|  | for (auto _ : state) { | 
|  | cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y); | 
|  | sum += cuda_y.Norm(); | 
|  | CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); | 
|  | } | 
|  | CHECK_NE(sum, 0.0); | 
|  | } | 
|  |  | 
|  | BENCHMARK(BM_CudaLeftMultiplyAndAccumulateUnstructured); | 
|  |  | 
|  | #endif | 
|  |  | 
|  | }  // namespace ceres::internal | 
|  |  | 
|  | BENCHMARK_MAIN(); |