CUDA CGNR, Part 3: CudaSparseMatrix * Added CudaSparseMatrix to manage and operate on sparse matrices with cuSparse. * Added tests for CudaSparseMatrix. * Added a new sparse linear operator benchmark. Change-Id: Id09df46de3b40be1f14441528088b54dab5844af
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index 02916b1..dae16b5 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -198,6 +198,7 @@ cost_function.cc covariance.cc covariance_impl.cc + cuda_sparse_matrix.cc cuda_vector.cc dense_cholesky.cc dense_normal_cholesky_solver.cc @@ -484,6 +485,7 @@ ceres_test(cuda_dense_cholesky) ceres_test(cuda_dense_qr) ceres_test(cuda_kernels) + ceres_test(cuda_sparse_matrix) ceres_test(cuda_vector) ceres_test(dense_linear_solver) ceres_test(dense_cholesky) @@ -589,5 +591,9 @@ add_executable(dense_linear_solver_benchmark dense_linear_solver_benchmark.cc) add_dependencies_to_benchmark(dense_linear_solver_benchmark) + add_executable(sparse_linear_operator_benchmark + sparse_linear_operator_benchmark.cc) + add_dependencies_to_benchmark(sparse_linear_operator_benchmark) + add_subdirectory(autodiff_benchmarks) endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc new file mode 100644 index 0000000..2f857e2 --- /dev/null +++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -0,0 +1,152 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: joydeepb@cs.utexas.edu (Joydeep Biswas) +// +// A CUDA sparse matrix linear operator. + +// This include must come before any #ifndef check on Ceres compile options. +// clang-format off +#include "ceres/internal/config.h" +// clang-format on + +#include "ceres/cuda_sparse_matrix.h" + +#include <math.h> +#include <memory> + +#include "ceres/internal/export.h" +#include "ceres/block_sparse_matrix.h" +#include "ceres/compressed_row_sparse_matrix.h" +#include "ceres/crs_matrix.h" +#include "ceres/types.h" +#include "ceres/context_impl.h" +#include "ceres/wall_time.h" + +#ifndef CERES_NO_CUDA + +#include "ceres/cuda_buffer.h" +#include "ceres/cuda_vector.h" +#include "ceres/ceres_cuda_kernels.h" +#include "cusparse.h" + + +namespace ceres::internal { + +CudaSparseMatrix::CudaSparseMatrix( + ContextImpl* context, + const CompressedRowSparseMatrix& crs_matrix) { + DCHECK_NE(context, nullptr); + CHECK(context->IsCUDAInitialized()); + context_ = context; + num_rows_ = crs_matrix.num_rows(); + num_cols_ = crs_matrix.num_cols(); + num_nonzeros_ = crs_matrix.num_nonzeros(); + rows_.CopyFromCpu( + crs_matrix.rows(), num_rows_ + 1, context_->stream_); + cols_.CopyFromCpu( + crs_matrix.cols(), num_nonzeros_, context_->stream_); + values_.CopyFromCpu( + crs_matrix.values(), num_nonzeros_, context_->stream_); + cusparseCreateCsr(&descr_, + num_rows_, + num_cols_, + num_nonzeros_, + rows_.data(), + cols_.data(), + values_.data(), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} + +CudaSparseMatrix::~CudaSparseMatrix() { + CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS); + descr_ = nullptr; +} + +void CudaSparseMatrix::CopyValuesFromCpu( + const CompressedRowSparseMatrix& crs_matrix) { + // There is no quick and easy way to verify that the structure is unchanged, + // but at least we can check that the size of the matrix and the number of + // nonzeros is unchanged. + CHECK_EQ(num_rows_, crs_matrix.num_rows()); + CHECK_EQ(num_cols_, crs_matrix.num_cols()); + CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros()); + values_.CopyFromCpu( + crs_matrix.values(), num_nonzeros_, context_->stream_); +} + +void CudaSparseMatrix::SpMv(cusparseOperation_t op, + const CudaVector& x, + CudaVector* y) { + size_t buffer_size = 0; + const double alpha = 1.0; + const double beta = 1.0; + + CHECK_EQ(cusparseSpMV_bufferSize(context_->cusparse_handle_, + op, + &alpha, + descr_, + x.descr(), + &beta, + y->descr(), + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + &buffer_size), + CUSPARSE_STATUS_SUCCESS); + spmv_buffer_.Reserve(buffer_size); + CHECK_EQ(cusparseSpMV(context_->cusparse_handle_, + op, + &alpha, + descr_, + x.descr(), + &beta, + y->descr(), + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, + spmv_buffer_.data()), + CUSPARSE_STATUS_SUCCESS); +} + +void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) { + SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x, y); +} + +void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) { + // TODO(Joydeep Biswas): We should consider storing a transposed copy of the + // matrix by converting CSR to CSC. From the cuSPARSE documentation: + // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA + // != CUSPARSE_OPERATION_NON_TRANSPOSE" + SpMv(CUSPARSE_OPERATION_TRANSPOSE, x, y); +} + +} // namespace ceres::internal + +#endif // CERES_NO_CUDA \ No newline at end of file
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h new file mode 100644 index 0000000..7661fb9 --- /dev/null +++ b/internal/ceres/cuda_sparse_matrix.h
@@ -0,0 +1,120 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: joydeepb@cs.utexas.edu (Joydeep Biswas) +// +// A CUDA sparse matrix linear operator. + +#ifndef CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_ +#define CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_ + +// This include must come before any #ifndef check on Ceres compile options. +// clang-format off +#include "ceres/internal/config.h" +// clang-format on + +#include <cstdint> +#include <memory> +#include <string> + +#include "ceres/compressed_row_sparse_matrix.h" +#include "ceres/internal/export.h" +#include "ceres/types.h" +#include "ceres/context_impl.h" + +#ifndef CERES_NO_CUDA +#include "ceres/cuda_buffer.h" +#include "ceres/cuda_vector.h" +#include "cusparse.h" + +namespace ceres::internal { + +// A sparse matrix hosted on the GPU in compressed row sparse format, with +// CUDA-accelerated operations. +class CERES_NO_EXPORT CudaSparseMatrix { + public: + + // Create a GPU copy of the matrix provided. The caller must ensure that + // InitCuda() has already been successfully called on context before calling + // this constructor. + CudaSparseMatrix(ContextImpl* context, + const CompressedRowSparseMatrix& crs_matrix); + + ~CudaSparseMatrix(); + + // y = y + Ax; + void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y); + // y = y + A'x; + void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y); + + int num_rows() const { return num_rows_; } + int num_cols() const { return num_cols_; } + int num_nonzeros() const { return num_nonzeros_; } + + // If subsequent uses of this matrix involve only numerical changes and no + // structural changes, then this method can be used to copy the updated + // non-zero values -- the row and column index arrays are kept the same. It + // is the caller's responsibility to ensure that the sparsity structure of the + // matrix is unchanged. + void CopyValuesFromCpu(const CompressedRowSparseMatrix& crs_matrix); + + const cusparseSpMatDescr_t& descr() const { return descr_; } + + private: + + // Disable copy and assignment. + CudaSparseMatrix(const CudaSparseMatrix&) = delete; + CudaSparseMatrix& operator=(const CudaSparseMatrix&) = delete; + + // y = y + op(M)x. op must be either CUSPARSE_OPERATION_NON_TRANSPOSE or + // CUSPARSE_OPERATION_TRANSPOSE. + void SpMv(cusparseOperation_t op, const CudaVector& x, CudaVector* y); + + int num_rows_ = 0; + int num_cols_ = 0; + int num_nonzeros_ = 0; + + // CSR row indices. + CudaBuffer<int32_t> rows_; + // CSR column indices. + CudaBuffer<int32_t> cols_; + // CSR values. + CudaBuffer<double> values_; + + ContextImpl* context_ = nullptr; + + // CuSparse object that describes this matrix. + cusparseSpMatDescr_t descr_ = nullptr; + + CudaBuffer<uint8_t> spmv_buffer_; +}; + +} // namespace ceres::internal + +#endif // CERES_NO_CUDA +#endif // CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_
diff --git a/internal/ceres/cuda_sparse_matrix_test.cc b/internal/ceres/cuda_sparse_matrix_test.cc new file mode 100644 index 0000000..3a01ff9 --- /dev/null +++ b/internal/ceres/cuda_sparse_matrix_test.cc
@@ -0,0 +1,315 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: joydeepb@cs.utexas.edu (Joydeep Biswas) + +#include <string> + +#include "ceres/casts.h" +#include "ceres/internal/config.h" +#include "ceres/internal/eigen.h" +#include "ceres/linear_least_squares_problems.h" +#include "ceres/block_sparse_matrix.h" +#include "ceres/cuda_vector.h" +#include "ceres/cuda_sparse_matrix.h" +#include "ceres/triplet_sparse_matrix.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace ceres { +namespace internal { + +#ifndef CERES_NO_CUDA + +class CudaSparseMatrixTest : public ::testing::Test { + protected: + void SetUp() final { + std::string message; + CHECK(context_.InitCUDA(&message)) + << "InitCUDA() failed because: " << message; + std::unique_ptr<LinearLeastSquaresProblem> problem = + CreateLinearLeastSquaresProblemFromId(2); + CHECK(problem != nullptr); + A_.reset(down_cast<BlockSparseMatrix*>(problem->A.release())); + CHECK(A_ != nullptr); + CHECK(problem->b != nullptr); + CHECK(problem->x != nullptr); + b_.resize(A_->num_rows()); + for (int i = 0; i < A_->num_rows(); ++i) { + b_[i] = problem->b[i]; + } + x_.resize(A_->num_cols()); + for (int i = 0; i < A_->num_cols(); ++i) { + x_[i] = problem->x[i]; + } + CHECK_EQ(A_->num_rows(), b_.rows()); + CHECK_EQ(A_->num_cols(), x_.rows()); + } + + std::unique_ptr<BlockSparseMatrix> A_; + Vector x_; + Vector b_; + ContextImpl context_; +}; + +TEST_F(CudaSparseMatrixTest, RightMultiplyAndAccumulate) { + std::string message; + CompressedRowSparseMatrix A_crs( + A_->num_rows(), A_->num_cols(), A_->num_nonzeros()); + A_->ToCompressedRowSparseMatrix(&A_crs); + CudaSparseMatrix A_gpu(&context_, A_crs); + CudaVector x_gpu(&context_, A_gpu.num_cols()); + CudaVector res_gpu(&context_, A_gpu.num_rows()); + x_gpu.CopyFromCpu(x_); + + const Vector minus_b = -b_; + // res = -b + res_gpu.CopyFromCpu(minus_b); + // res += A * x + A_gpu.RightMultiplyAndAccumulate(x_gpu, &res_gpu); + + Vector res; + res_gpu.CopyTo(&res); + + Vector res_expected = minus_b; + A_->RightMultiplyAndAccumulate(x_.data(), res_expected.data()); + + EXPECT_LE((res - res_expected).norm(), + std::numeric_limits<double>::epsilon() * 1e3); +} + +TEST(CudaSparseMatrix, CopyValuesFromCpu) { + // A1: + // [ 1 1 0 0 + // 0 1 1 0] + // A2: + // [ 1 2 0 0 + // 0 3 4 0] + // b: [1 2 3 4]' + // A1 * b = [3 5]' + // A2 * b = [5 18]' + TripletSparseMatrix A1( + 2, + 4, + {0, 0, 1, 1}, + {0, 1, 1, 2}, + {1, 1, 1, 1} + ); + TripletSparseMatrix A2( + 2, + 4, + {0, 0, 1, 1}, + {0, 1, 1, 2}, + {1, 2, 3, 4} + ); + Vector b(4); + b << 1, 2, 3, 4; + + + ContextImpl context; + std::string message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; + auto A1_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A1); + CudaSparseMatrix A_gpu(&context, *A1_crs); + CudaVector b_gpu(&context, A1.num_cols()); + CudaVector x_gpu(&context, A1.num_rows()); + b_gpu.CopyFromCpu(b); + x_gpu.SetZero(); + + Vector x_expected(2); + x_expected << 3, 5; + A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu); + Vector x_computed; + x_gpu.CopyTo(&x_computed); + EXPECT_EQ(x_computed, x_expected); + + auto A2_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A2); + A_gpu.CopyValuesFromCpu(*A2_crs); + x_gpu.SetZero(); + x_expected << 5, 18; + A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu); + x_gpu.CopyTo(&x_computed); + EXPECT_EQ(x_computed, x_expected); +} + +TEST(CudaSparseMatrix, RightMultiplyAndAccumulate) { + // A: + // [ 1 2 0 0 + // 0 3 4 0] + // b: [1 2 3 4]' + // A * b = [5 18]' + TripletSparseMatrix A( + 2, + 4, + {0, 0, 1, 1}, + {0, 1, 1, 2}, + {1, 2, 3, 4} + ); + Vector b(4); + b << 1, 2, 3, 4; + Vector x_expected(2); + x_expected << 5, 18; + + ContextImpl context; + std::string message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; + auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A); + CudaSparseMatrix A_gpu(&context, *A_crs); + CudaVector b_gpu(&context, A.num_cols()); + CudaVector x_gpu(&context, A.num_rows()); + b_gpu.CopyFromCpu(b); + x_gpu.SetZero(); + + A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu); + + Vector x_computed; + x_gpu.CopyTo(&x_computed); + + EXPECT_EQ(x_computed, x_expected); +} + +TEST(CudaSparseMatrix, LeftMultiplyAndAccumulate) { + // A: + // [ 1 2 0 0 + // 0 3 4 0] + // b: [1 2]' + // A'* b = [1 8 8 0]' + TripletSparseMatrix A( + 2, + 4, + {0, 0, 1, 1}, + {0, 1, 1, 2}, + {1, 2, 3, 4} + ); + Vector b(2); + b << 1, 2; + Vector x_expected(4); + x_expected << 1, 8, 8, 0; + + ContextImpl context; + std::string message; + CHECK(context.InitCUDA(&message)) + << "InitCUDA() failed because: " << message; + auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A); + CudaSparseMatrix A_gpu(&context, *A_crs); + CudaVector b_gpu(&context, A.num_rows()); + CudaVector x_gpu(&context, A.num_cols()); + b_gpu.CopyFromCpu(b); + x_gpu.SetZero(); + + A_gpu.LeftMultiplyAndAccumulate(b_gpu, &x_gpu); + + Vector x_computed; + x_gpu.CopyTo(&x_computed); + + EXPECT_EQ(x_computed, x_expected); +} + +// If there are numerical errors due to synchronization issues, they will show +// up when testing with large matrices, since each operation will take +// significant time, thus hopefully revealing any potential synchronization +// issues. +TEST(CudaSparseMatrix, LargeMultiplyAndAccumulate) { + // Create a large NxN matrix A that has the following structure: + // In row i, only columns i and i+1 are non-zero. + // A_{i, i} = A_{i, i+1} = 1. + // There will be 2 * N - 1 non-zero elements in A. + // X = [1:N] + // Right multiply test: + // b = A * X + // Left multiply test: + // b = A' * X + + const int N = 10 * 1000 * 1000; + const int num_non_zeros = 2 * N - 1; + std::vector<int> row_indices(num_non_zeros); + std::vector<int> col_indices(num_non_zeros); + std::vector<double> values(num_non_zeros); + + for (int i = 0; i < N; ++i) { + row_indices[2 * i] = i; + col_indices[2 * i] = i; + values[2 * i] = 1.0; + if (i + 1 < N) { + col_indices[2 * i + 1] = i + 1; + row_indices[2 * i + 1] = i; + values[2 * i + 1] = 1; + } + } + TripletSparseMatrix A(N, N, row_indices, col_indices, values); + Vector x(N); + for (int i = 0; i < N; ++i) { + x[i] = i + 1; + } + + ContextImpl context; + std::string message; + CHECK(context.InitCUDA(&message)) + << "InitCUDA() failed because: " << message; + auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A); + CudaSparseMatrix A_gpu(&context, *A_crs); + CudaVector b_gpu(&context, N); + CudaVector x_gpu(&context, N); + x_gpu.CopyFromCpu(x); + + + // First check RightMultiply. + { + b_gpu.SetZero(); + A_gpu.RightMultiplyAndAccumulate(x_gpu, &b_gpu); + Vector b_computed; + b_gpu.CopyTo(&b_computed); + for (int i = 0; i < N; ++i) { + if (i + 1 < N) { + EXPECT_EQ(b_computed[i], 2 * (i + 1) + 1); + } else { + EXPECT_EQ(b_computed[i], i + 1); + } + } + } + + // Next check LeftMultiply. + { + b_gpu.SetZero(); + A_gpu.LeftMultiplyAndAccumulate(x_gpu, &b_gpu); + Vector b_computed; + b_gpu.CopyTo(&b_computed); + for (int i = 0; i < N; ++i) { + if (i > 0) { + EXPECT_EQ(b_computed[i], 2 * (i + 1) - 1); + } else { + EXPECT_EQ(b_computed[i], i + 1); + } + } + } +} + +#endif // CERES_NO_CUDA + +} // namespace internal +} // namespace ceres
diff --git a/internal/ceres/sparse_linear_operator_benchmark.cc b/internal/ceres/sparse_linear_operator_benchmark.cc new file mode 100644 index 0000000..c224e35 --- /dev/null +++ b/internal/ceres/sparse_linear_operator_benchmark.cc
@@ -0,0 +1,239 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: joydeepb@cs.utexas.edu (Joydeep Biswas) + +#include <iostream> +#include <memory> +#include <random> +#include <string> + +#include "Eigen/Dense" +#include "gflags/gflags.h" +#include "benchmark/benchmark.h" +#include "ceres/context_impl.h" +#include "ceres/block_sparse_matrix.h" +#include "ceres/cuda_sparse_matrix.h" +#include "ceres/cuda_vector.h" +#include "ceres/internal/config.h" +#include "ceres/internal/eigen.h" +#include "ceres/linear_solver.h" +#include "cuda_runtime.h" + +namespace ceres::internal { + +// TODO(Joydeep Biswas): Add a matrix of benchmark sizes to test. + +namespace { +// Generate a synthetic BA-style Jacobian with n camera poses, m landmarks, n_d +// parameters per camera, m_d parameters per landmark, and k residuals per +// camera. +// TODO: Unify the synthetic Jacobian generation code with the code from +// schur_eliminator_benchmark.cc since they are very similar. +std::unique_ptr<BlockSparseMatrix> GenerateSyntheticJacobian( + int n, int m, int n_d, int m_d, int k) { + static const int kResidualSize = 2; + CompressedRowBlockStructure* bs = new CompressedRowBlockStructure; + int c = 0; + // Add column blocks for each camera. + for (int i = 0; i < n; ++i) { + bs->cols.push_back(Block(n_d, c)); + c += n_d; + } + // Add column blocks for each landmark. + for (int i = 0; i < m; ++i) { + bs->cols.push_back(Block(m_d, c)); + c += m_d; + } + // Total number of row blocks = k * n. + bs->rows.resize(k * n); + int values_offset = 0; + std::mt19937 prng; + std::uniform_real_distribution uniform_0_m(0.0, static_cast<double>(m)); + // Generate structure of the Jacobian. + // For n cameras: + for (int i = 0; i < n; ++i) { + const int camera_block_id = i; + // For k residuals per camera: + for (int j = 0; j < k; ++j) { + // Pick the landmark of the residual randomly from [0, m). + const int landmark_id = uniform_0_m(prng); + const int landmark_block_id = n + landmark_id; + const int row_idx = i * k + j; + const int row = kResidualSize * row_idx; + bs->rows[row_idx].block = Block(kResidualSize, row); + bs->rows[row_idx].cells.resize(2); + // The camera part of the jacobian of this residual. + bs->rows[row_idx].cells[0] = Cell(camera_block_id, values_offset); + values_offset += n_d * kResidualSize; + // The landmark part of the jacobian of this residual. + bs->rows[row_idx].cells[1] = Cell(landmark_block_id, values_offset); + values_offset += m_d * kResidualSize; + } + } + std::unique_ptr<BlockSparseMatrix> jacobian = + std::make_unique<BlockSparseMatrix>(bs); + VectorRef(jacobian->mutable_values(), jacobian->num_nonzeros()).setRandom(); + return jacobian; +} +} // namespace + +DEFINE_int32(num_cameras, 1000, "Number of cameras."); +DEFINE_int32(num_landmarks, 10000, "Number of landmarks."); +DEFINE_int32(num_parameters_per_camera, 6, "Number of parameters per camera."); +DEFINE_int32(num_parameters_per_landmark, + 3, + "Number of parameters per landmark."); +DEFINE_int32(num_residuals_per_camera, 100, "Number of residuals per camera."); + +static void BM_CpuRightMultiplyAndAccumulate(benchmark::State& state) { + // Perform setup here + std::unique_ptr<BlockSparseMatrix> jacobian = + GenerateSyntheticJacobian(FLAGS_num_cameras, + FLAGS_num_landmarks, + FLAGS_num_parameters_per_camera, + FLAGS_num_parameters_per_landmark, + FLAGS_num_residuals_per_camera); + Vector x(jacobian->num_cols()); + Vector y(jacobian->num_rows()); + x.setRandom(); + y.setRandom(); + double sum = 0; + for (auto _ : state) { + // This code gets timed + jacobian->RightMultiplyAndAccumulate(x.data(), y.data()); + sum += y.norm(); + } + CHECK_NE(sum, 0.0); +} + +static void BM_CpuLeftMultiplyAndAccumulate(benchmark::State& state) { + // Perform setup here + std::unique_ptr<BlockSparseMatrix> jacobian = + GenerateSyntheticJacobian(FLAGS_num_cameras, + FLAGS_num_landmarks, + FLAGS_num_parameters_per_camera, + FLAGS_num_parameters_per_landmark, + FLAGS_num_residuals_per_camera); + Vector x(jacobian->num_rows()); + Vector y(jacobian->num_cols()); + x.setRandom(); + y.setRandom(); + double sum = 0; + for (auto _ : state) { + // This code gets timed + jacobian->LeftMultiplyAndAccumulate(x.data(), y.data()); + sum += y.norm(); + } + CHECK_NE(sum, 0.0); +} + +static void BM_CudaRightMultiplyAndAccumulate(benchmark::State& state) { + // Perform setup here + std::unique_ptr<BlockSparseMatrix> jacobian = + GenerateSyntheticJacobian(FLAGS_num_cameras, + FLAGS_num_landmarks, + FLAGS_num_parameters_per_camera, + FLAGS_num_parameters_per_landmark, + FLAGS_num_residuals_per_camera); + ContextImpl context; + std::string message; + context.InitCUDA(&message); + CompressedRowSparseMatrix jacobian_crs( + jacobian->num_rows(), + jacobian->num_cols(), + jacobian->num_nonzeros()); + jacobian->ToCompressedRowSparseMatrix(&jacobian_crs); + CudaSparseMatrix cuda_jacobian(&context, jacobian_crs); + CudaVector cuda_x(&context, 0); + CudaVector cuda_y(&context, 0); + + Vector x(jacobian->num_cols()); + Vector y(jacobian->num_rows()); + x.setRandom(); + y.setRandom(); + + cuda_x.CopyFromCpu(x); + cuda_y.CopyFromCpu(y); + double sum = 0; + for (auto _ : state) { + // This code gets timed + cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y); + sum += cuda_y.Norm(); + CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); + } + CHECK_NE(sum, 0.0); +} + +static void BM_CudaLeftMultiplyAndAccumulate(benchmark::State& state) { + // Perform setup here + std::unique_ptr<BlockSparseMatrix> jacobian = + GenerateSyntheticJacobian(FLAGS_num_cameras, + FLAGS_num_landmarks, + FLAGS_num_parameters_per_camera, + FLAGS_num_parameters_per_landmark, + FLAGS_num_residuals_per_camera); + ContextImpl context; + std::string message; + context.InitCUDA(&message); + CompressedRowSparseMatrix jacobian_crs( + jacobian->num_rows(), + jacobian->num_cols(), + jacobian->num_nonzeros()); + jacobian->ToCompressedRowSparseMatrix(&jacobian_crs); + CudaSparseMatrix cuda_jacobian(&context, jacobian_crs); + CudaVector cuda_x(&context, 0); + CudaVector cuda_y(&context, 0); + + Vector x(jacobian->num_rows()); + Vector y(jacobian->num_cols()); + x.setRandom(); + y.setRandom(); + + cuda_x.CopyFromCpu(x); + cuda_y.CopyFromCpu(y); + double sum = 0; + for (auto _ : state) { + // This code gets timed + cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y); + sum += cuda_y.Norm(); + CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); + } + CHECK_NE(sum, 0.0); +} + +BENCHMARK(BM_CpuRightMultiplyAndAccumulate); +BENCHMARK(BM_CpuLeftMultiplyAndAccumulate); +BENCHMARK(BM_CudaRightMultiplyAndAccumulate); +BENCHMARK(BM_CudaLeftMultiplyAndAccumulate); + +BENCHMARK_MAIN(); + +} // namespace ceres::internal + +BENCHMARK_MAIN();