CUDA CGNR, Part 3: CudaSparseMatrix
* Added CudaSparseMatrix to manage and operate on sparse matrices with
cuSPARSE.
* Added tests for CudaSparseMatrix.
* Added a new sparse linear operator benchmark.
Change-Id: Id09df46de3b40be1f14441528088b54dab5844af
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 02916b1..dae16b5 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -198,6 +198,7 @@
cost_function.cc
covariance.cc
covariance_impl.cc
+ cuda_sparse_matrix.cc
cuda_vector.cc
dense_cholesky.cc
dense_normal_cholesky_solver.cc
@@ -484,6 +485,7 @@
ceres_test(cuda_dense_cholesky)
ceres_test(cuda_dense_qr)
ceres_test(cuda_kernels)
+ ceres_test(cuda_sparse_matrix)
ceres_test(cuda_vector)
ceres_test(dense_linear_solver)
ceres_test(dense_cholesky)
@@ -589,5 +591,9 @@
add_executable(dense_linear_solver_benchmark dense_linear_solver_benchmark.cc)
add_dependencies_to_benchmark(dense_linear_solver_benchmark)
+ add_executable(sparse_linear_operator_benchmark
+ sparse_linear_operator_benchmark.cc)
+ add_dependencies_to_benchmark(sparse_linear_operator_benchmark)
+
add_subdirectory(autodiff_benchmarks)
endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
new file mode 100644
index 0000000..2f857e2
--- /dev/null
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -0,0 +1,152 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+//
+// A CUDA sparse matrix linear operator.
+
+// This include must come before any #ifndef check on Ceres compile options.
+// clang-format off
+#include "ceres/internal/config.h"
+// clang-format on
+
+#include "ceres/cuda_sparse_matrix.h"
+
+#include <math.h>
+#include <memory>
+
+#include "ceres/internal/export.h"
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/compressed_row_sparse_matrix.h"
+#include "ceres/crs_matrix.h"
+#include "ceres/types.h"
+#include "ceres/context_impl.h"
+#include "ceres/wall_time.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "ceres/cuda_buffer.h"
+#include "ceres/cuda_vector.h"
+#include "ceres/ceres_cuda_kernels.h"
+#include "cusparse.h"
+
+
+namespace ceres::internal {
+
+CudaSparseMatrix::CudaSparseMatrix(
+ ContextImpl* context,
+ const CompressedRowSparseMatrix& crs_matrix) {
+ DCHECK_NE(context, nullptr);
+ CHECK(context->IsCUDAInitialized());
+ context_ = context;
+ num_rows_ = crs_matrix.num_rows();
+ num_cols_ = crs_matrix.num_cols();
+ num_nonzeros_ = crs_matrix.num_nonzeros();
+ rows_.CopyFromCpu(
+ crs_matrix.rows(), num_rows_ + 1, context_->stream_);
+ cols_.CopyFromCpu(
+ crs_matrix.cols(), num_nonzeros_, context_->stream_);
+ values_.CopyFromCpu(
+ crs_matrix.values(), num_nonzeros_, context_->stream_);
+  CHECK_EQ(cusparseCreateCsr(&descr_,
+                             num_rows_,
+                             num_cols_,
+                             num_nonzeros_,
+                             rows_.data(),
+                             cols_.data(),
+                             values_.data(),
+                             CUSPARSE_INDEX_32I,
+                             CUSPARSE_INDEX_32I,
+                             CUSPARSE_INDEX_BASE_ZERO,
+                             CUDA_R_64F), CUSPARSE_STATUS_SUCCESS);
+}
+
+CudaSparseMatrix::~CudaSparseMatrix() {
+ CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS);
+ descr_ = nullptr;
+}
+
+void CudaSparseMatrix::CopyValuesFromCpu(
+ const CompressedRowSparseMatrix& crs_matrix) {
+ // There is no quick and easy way to verify that the structure is unchanged,
+ // but at least we can check that the size of the matrix and the number of
+ // nonzeros is unchanged.
+ CHECK_EQ(num_rows_, crs_matrix.num_rows());
+ CHECK_EQ(num_cols_, crs_matrix.num_cols());
+ CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
+ values_.CopyFromCpu(
+ crs_matrix.values(), num_nonzeros_, context_->stream_);
+}
+
+void CudaSparseMatrix::SpMv(cusparseOperation_t op,
+ const CudaVector& x,
+ CudaVector* y) {
+ size_t buffer_size = 0;
+ const double alpha = 1.0;
+ const double beta = 1.0;
+
+ CHECK_EQ(cusparseSpMV_bufferSize(context_->cusparse_handle_,
+ op,
+ &alpha,
+ descr_,
+ x.descr(),
+ &beta,
+ y->descr(),
+ CUDA_R_64F,
+ CUSPARSE_SPMV_ALG_DEFAULT,
+ &buffer_size),
+ CUSPARSE_STATUS_SUCCESS);
+ spmv_buffer_.Reserve(buffer_size);
+ CHECK_EQ(cusparseSpMV(context_->cusparse_handle_,
+ op,
+ &alpha,
+ descr_,
+ x.descr(),
+ &beta,
+ y->descr(),
+ CUDA_R_64F,
+ CUSPARSE_SPMV_ALG_DEFAULT,
+ spmv_buffer_.data()),
+ CUSPARSE_STATUS_SUCCESS);
+}
+
+void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) {
+ SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x, y);
+}
+
+void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) {
+ // TODO(Joydeep Biswas): We should consider storing a transposed copy of the
+ // matrix by converting CSR to CSC. From the cuSPARSE documentation:
+ // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
+ // != CUSPARSE_OPERATION_NON_TRANSPOSE"
+ SpMv(CUSPARSE_OPERATION_TRANSPOSE, x, y);
+}
+
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
\ No newline at end of file
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h
new file mode 100644
index 0000000..7661fb9
--- /dev/null
+++ b/internal/ceres/cuda_sparse_matrix.h
@@ -0,0 +1,120 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+//
+// A CUDA sparse matrix linear operator.
+
+#ifndef CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_
+#define CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_
+
+// This include must come before any #ifndef check on Ceres compile options.
+// clang-format off
+#include "ceres/internal/config.h"
+// clang-format on
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "ceres/compressed_row_sparse_matrix.h"
+#include "ceres/internal/export.h"
+#include "ceres/types.h"
+#include "ceres/context_impl.h"
+
+#ifndef CERES_NO_CUDA
+#include "ceres/cuda_buffer.h"
+#include "ceres/cuda_vector.h"
+#include "cusparse.h"
+
+namespace ceres::internal {
+
+// A sparse matrix hosted on the GPU in compressed row sparse format, with
+// CUDA-accelerated operations.
+class CERES_NO_EXPORT CudaSparseMatrix {
+ public:
+
+ // Create a GPU copy of the matrix provided. The caller must ensure that
+ // InitCuda() has already been successfully called on context before calling
+ // this constructor.
+ CudaSparseMatrix(ContextImpl* context,
+ const CompressedRowSparseMatrix& crs_matrix);
+
+ ~CudaSparseMatrix();
+
+ // y = y + Ax;
+ void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y);
+ // y = y + A'x;
+ void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y);
+
+ int num_rows() const { return num_rows_; }
+ int num_cols() const { return num_cols_; }
+ int num_nonzeros() const { return num_nonzeros_; }
+
+ // If subsequent uses of this matrix involve only numerical changes and no
+ // structural changes, then this method can be used to copy the updated
+ // non-zero values -- the row and column index arrays are kept the same. It
+ // is the caller's responsibility to ensure that the sparsity structure of the
+ // matrix is unchanged.
+ void CopyValuesFromCpu(const CompressedRowSparseMatrix& crs_matrix);
+
+ const cusparseSpMatDescr_t& descr() const { return descr_; }
+
+ private:
+
+ // Disable copy and assignment.
+ CudaSparseMatrix(const CudaSparseMatrix&) = delete;
+ CudaSparseMatrix& operator=(const CudaSparseMatrix&) = delete;
+
+ // y = y + op(M)x. op must be either CUSPARSE_OPERATION_NON_TRANSPOSE or
+ // CUSPARSE_OPERATION_TRANSPOSE.
+ void SpMv(cusparseOperation_t op, const CudaVector& x, CudaVector* y);
+
+ int num_rows_ = 0;
+ int num_cols_ = 0;
+ int num_nonzeros_ = 0;
+
+ // CSR row indices.
+ CudaBuffer<int32_t> rows_;
+ // CSR column indices.
+ CudaBuffer<int32_t> cols_;
+ // CSR values.
+ CudaBuffer<double> values_;
+
+ ContextImpl* context_ = nullptr;
+
+ // CuSparse object that describes this matrix.
+ cusparseSpMatDescr_t descr_ = nullptr;
+
+ CudaBuffer<uint8_t> spmv_buffer_;
+};
+
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
+#endif // CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_
diff --git a/internal/ceres/cuda_sparse_matrix_test.cc b/internal/ceres/cuda_sparse_matrix_test.cc
new file mode 100644
index 0000000..3a01ff9
--- /dev/null
+++ b/internal/ceres/cuda_sparse_matrix_test.cc
@@ -0,0 +1,315 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <string>
+
+#include "ceres/casts.h"
+#include "ceres/internal/config.h"
+#include "ceres/internal/eigen.h"
+#include "ceres/linear_least_squares_problems.h"
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_vector.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/triplet_sparse_matrix.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace ceres {
+namespace internal {
+
+#ifndef CERES_NO_CUDA
+
+class CudaSparseMatrixTest : public ::testing::Test {
+ protected:
+ void SetUp() final {
+ std::string message;
+ CHECK(context_.InitCUDA(&message))
+ << "InitCUDA() failed because: " << message;
+ std::unique_ptr<LinearLeastSquaresProblem> problem =
+ CreateLinearLeastSquaresProblemFromId(2);
+ CHECK(problem != nullptr);
+ A_.reset(down_cast<BlockSparseMatrix*>(problem->A.release()));
+ CHECK(A_ != nullptr);
+ CHECK(problem->b != nullptr);
+ CHECK(problem->x != nullptr);
+ b_.resize(A_->num_rows());
+ for (int i = 0; i < A_->num_rows(); ++i) {
+ b_[i] = problem->b[i];
+ }
+ x_.resize(A_->num_cols());
+ for (int i = 0; i < A_->num_cols(); ++i) {
+ x_[i] = problem->x[i];
+ }
+ CHECK_EQ(A_->num_rows(), b_.rows());
+ CHECK_EQ(A_->num_cols(), x_.rows());
+ }
+
+ std::unique_ptr<BlockSparseMatrix> A_;
+ Vector x_;
+ Vector b_;
+ ContextImpl context_;
+};
+
+TEST_F(CudaSparseMatrixTest, RightMultiplyAndAccumulate) {
+ std::string message;
+ CompressedRowSparseMatrix A_crs(
+ A_->num_rows(), A_->num_cols(), A_->num_nonzeros());
+ A_->ToCompressedRowSparseMatrix(&A_crs);
+ CudaSparseMatrix A_gpu(&context_, A_crs);
+ CudaVector x_gpu(&context_, A_gpu.num_cols());
+ CudaVector res_gpu(&context_, A_gpu.num_rows());
+ x_gpu.CopyFromCpu(x_);
+
+ const Vector minus_b = -b_;
+ // res = -b
+ res_gpu.CopyFromCpu(minus_b);
+ // res += A * x
+ A_gpu.RightMultiplyAndAccumulate(x_gpu, &res_gpu);
+
+ Vector res;
+ res_gpu.CopyTo(&res);
+
+ Vector res_expected = minus_b;
+ A_->RightMultiplyAndAccumulate(x_.data(), res_expected.data());
+
+ EXPECT_LE((res - res_expected).norm(),
+ std::numeric_limits<double>::epsilon() * 1e3);
+}
+
+TEST(CudaSparseMatrix, CopyValuesFromCpu) {
+ // A1:
+ // [ 1 1 0 0
+ // 0 1 1 0]
+ // A2:
+ // [ 1 2 0 0
+ // 0 3 4 0]
+ // b: [1 2 3 4]'
+ // A1 * b = [3 5]'
+ // A2 * b = [5 18]'
+ TripletSparseMatrix A1(
+ 2,
+ 4,
+ {0, 0, 1, 1},
+ {0, 1, 1, 2},
+ {1, 1, 1, 1}
+ );
+ TripletSparseMatrix A2(
+ 2,
+ 4,
+ {0, 0, 1, 1},
+ {0, 1, 1, 2},
+ {1, 2, 3, 4}
+ );
+ Vector b(4);
+ b << 1, 2, 3, 4;
+
+
+ ContextImpl context;
+ std::string message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ auto A1_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A1);
+ CudaSparseMatrix A_gpu(&context, *A1_crs);
+ CudaVector b_gpu(&context, A1.num_cols());
+ CudaVector x_gpu(&context, A1.num_rows());
+ b_gpu.CopyFromCpu(b);
+ x_gpu.SetZero();
+
+ Vector x_expected(2);
+ x_expected << 3, 5;
+ A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu);
+ Vector x_computed;
+ x_gpu.CopyTo(&x_computed);
+ EXPECT_EQ(x_computed, x_expected);
+
+ auto A2_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A2);
+ A_gpu.CopyValuesFromCpu(*A2_crs);
+ x_gpu.SetZero();
+ x_expected << 5, 18;
+ A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu);
+ x_gpu.CopyTo(&x_computed);
+ EXPECT_EQ(x_computed, x_expected);
+}
+
+TEST(CudaSparseMatrix, RightMultiplyAndAccumulate) {
+ // A:
+ // [ 1 2 0 0
+ // 0 3 4 0]
+ // b: [1 2 3 4]'
+ // A * b = [5 18]'
+ TripletSparseMatrix A(
+ 2,
+ 4,
+ {0, 0, 1, 1},
+ {0, 1, 1, 2},
+ {1, 2, 3, 4}
+ );
+ Vector b(4);
+ b << 1, 2, 3, 4;
+ Vector x_expected(2);
+ x_expected << 5, 18;
+
+ ContextImpl context;
+ std::string message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
+ CudaSparseMatrix A_gpu(&context, *A_crs);
+ CudaVector b_gpu(&context, A.num_cols());
+ CudaVector x_gpu(&context, A.num_rows());
+ b_gpu.CopyFromCpu(b);
+ x_gpu.SetZero();
+
+ A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu);
+
+ Vector x_computed;
+ x_gpu.CopyTo(&x_computed);
+
+ EXPECT_EQ(x_computed, x_expected);
+}
+
+TEST(CudaSparseMatrix, LeftMultiplyAndAccumulate) {
+ // A:
+ // [ 1 2 0 0
+ // 0 3 4 0]
+ // b: [1 2]'
+ // A'* b = [1 8 8 0]'
+ TripletSparseMatrix A(
+ 2,
+ 4,
+ {0, 0, 1, 1},
+ {0, 1, 1, 2},
+ {1, 2, 3, 4}
+ );
+ Vector b(2);
+ b << 1, 2;
+ Vector x_expected(4);
+ x_expected << 1, 8, 8, 0;
+
+ ContextImpl context;
+ std::string message;
+ CHECK(context.InitCUDA(&message))
+ << "InitCUDA() failed because: " << message;
+ auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
+ CudaSparseMatrix A_gpu(&context, *A_crs);
+ CudaVector b_gpu(&context, A.num_rows());
+ CudaVector x_gpu(&context, A.num_cols());
+ b_gpu.CopyFromCpu(b);
+ x_gpu.SetZero();
+
+ A_gpu.LeftMultiplyAndAccumulate(b_gpu, &x_gpu);
+
+ Vector x_computed;
+ x_gpu.CopyTo(&x_computed);
+
+ EXPECT_EQ(x_computed, x_expected);
+}
+
+// If there are numerical errors due to synchronization issues, they will show
+// up when testing with large matrices, since each operation will take
+// significant time, thus hopefully revealing any potential synchronization
+// issues.
+TEST(CudaSparseMatrix, LargeMultiplyAndAccumulate) {
+ // Create a large NxN matrix A that has the following structure:
+ // In row i, only columns i and i+1 are non-zero.
+ // A_{i, i} = A_{i, i+1} = 1.
+ // There will be 2 * N - 1 non-zero elements in A.
+ // X = [1:N]
+ // Right multiply test:
+ // b = A * X
+ // Left multiply test:
+ // b = A' * X
+
+ const int N = 10 * 1000 * 1000;
+ const int num_non_zeros = 2 * N - 1;
+ std::vector<int> row_indices(num_non_zeros);
+ std::vector<int> col_indices(num_non_zeros);
+ std::vector<double> values(num_non_zeros);
+
+ for (int i = 0; i < N; ++i) {
+ row_indices[2 * i] = i;
+ col_indices[2 * i] = i;
+ values[2 * i] = 1.0;
+ if (i + 1 < N) {
+ col_indices[2 * i + 1] = i + 1;
+ row_indices[2 * i + 1] = i;
+ values[2 * i + 1] = 1;
+ }
+ }
+ TripletSparseMatrix A(N, N, row_indices, col_indices, values);
+ Vector x(N);
+ for (int i = 0; i < N; ++i) {
+ x[i] = i + 1;
+ }
+
+ ContextImpl context;
+ std::string message;
+ CHECK(context.InitCUDA(&message))
+ << "InitCUDA() failed because: " << message;
+ auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
+ CudaSparseMatrix A_gpu(&context, *A_crs);
+ CudaVector b_gpu(&context, N);
+ CudaVector x_gpu(&context, N);
+ x_gpu.CopyFromCpu(x);
+
+
+ // First check RightMultiply.
+ {
+ b_gpu.SetZero();
+ A_gpu.RightMultiplyAndAccumulate(x_gpu, &b_gpu);
+ Vector b_computed;
+ b_gpu.CopyTo(&b_computed);
+ for (int i = 0; i < N; ++i) {
+ if (i + 1 < N) {
+ EXPECT_EQ(b_computed[i], 2 * (i + 1) + 1);
+ } else {
+ EXPECT_EQ(b_computed[i], i + 1);
+ }
+ }
+ }
+
+ // Next check LeftMultiply.
+ {
+ b_gpu.SetZero();
+ A_gpu.LeftMultiplyAndAccumulate(x_gpu, &b_gpu);
+ Vector b_computed;
+ b_gpu.CopyTo(&b_computed);
+ for (int i = 0; i < N; ++i) {
+ if (i > 0) {
+ EXPECT_EQ(b_computed[i], 2 * (i + 1) - 1);
+ } else {
+ EXPECT_EQ(b_computed[i], i + 1);
+ }
+ }
+ }
+}
+
+#endif // CERES_NO_CUDA
+
+} // namespace internal
+} // namespace ceres
diff --git a/internal/ceres/sparse_linear_operator_benchmark.cc b/internal/ceres/sparse_linear_operator_benchmark.cc
new file mode 100644
index 0000000..c224e35
--- /dev/null
+++ b/internal/ceres/sparse_linear_operator_benchmark.cc
@@ -0,0 +1,239 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <iostream>
+#include <memory>
+#include <random>
+#include <string>
+
+#include "Eigen/Dense"
+#include "gflags/gflags.h"
+#include "benchmark/benchmark.h"
+#include "ceres/context_impl.h"
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_vector.h"
+#include "ceres/internal/config.h"
+#include "ceres/internal/eigen.h"
+#include "ceres/linear_solver.h"
+#include "cuda_runtime.h"
+
+namespace ceres::internal {
+
+// TODO(Joydeep Biswas): Add a matrix of benchmark sizes to test.
+
+namespace {
+// Generate a synthetic BA-style Jacobian with n camera poses, m landmarks, n_d
+// parameters per camera, m_d parameters per landmark, and k residuals per
+// camera.
+// TODO: Unify the synthetic Jacobian generation code with the code from
+// schur_eliminator_benchmark.cc since they are very similar.
+std::unique_ptr<BlockSparseMatrix> GenerateSyntheticJacobian(
+ int n, int m, int n_d, int m_d, int k) {
+ static const int kResidualSize = 2;
+ CompressedRowBlockStructure* bs = new CompressedRowBlockStructure;
+ int c = 0;
+ // Add column blocks for each camera.
+ for (int i = 0; i < n; ++i) {
+ bs->cols.push_back(Block(n_d, c));
+ c += n_d;
+ }
+ // Add column blocks for each landmark.
+ for (int i = 0; i < m; ++i) {
+ bs->cols.push_back(Block(m_d, c));
+ c += m_d;
+ }
+ // Total number of row blocks = k * n.
+ bs->rows.resize(k * n);
+ int values_offset = 0;
+ std::mt19937 prng;
+ std::uniform_real_distribution uniform_0_m(0.0, static_cast<double>(m));
+ // Generate structure of the Jacobian.
+ // For n cameras:
+ for (int i = 0; i < n; ++i) {
+ const int camera_block_id = i;
+ // For k residuals per camera:
+ for (int j = 0; j < k; ++j) {
+ // Pick the landmark of the residual randomly from [0, m).
+ const int landmark_id = uniform_0_m(prng);
+ const int landmark_block_id = n + landmark_id;
+ const int row_idx = i * k + j;
+ const int row = kResidualSize * row_idx;
+ bs->rows[row_idx].block = Block(kResidualSize, row);
+ bs->rows[row_idx].cells.resize(2);
+ // The camera part of the jacobian of this residual.
+ bs->rows[row_idx].cells[0] = Cell(camera_block_id, values_offset);
+ values_offset += n_d * kResidualSize;
+ // The landmark part of the jacobian of this residual.
+ bs->rows[row_idx].cells[1] = Cell(landmark_block_id, values_offset);
+ values_offset += m_d * kResidualSize;
+ }
+ }
+ std::unique_ptr<BlockSparseMatrix> jacobian =
+ std::make_unique<BlockSparseMatrix>(bs);
+ VectorRef(jacobian->mutable_values(), jacobian->num_nonzeros()).setRandom();
+ return jacobian;
+}
+} // namespace
+
+DEFINE_int32(num_cameras, 1000, "Number of cameras.");
+DEFINE_int32(num_landmarks, 10000, "Number of landmarks.");
+DEFINE_int32(num_parameters_per_camera, 6, "Number of parameters per camera.");
+DEFINE_int32(num_parameters_per_landmark,
+ 3,
+ "Number of parameters per landmark.");
+DEFINE_int32(num_residuals_per_camera, 100, "Number of residuals per camera.");
+
+static void BM_CpuRightMultiplyAndAccumulate(benchmark::State& state) {
+ // Perform setup here
+ std::unique_ptr<BlockSparseMatrix> jacobian =
+ GenerateSyntheticJacobian(FLAGS_num_cameras,
+ FLAGS_num_landmarks,
+ FLAGS_num_parameters_per_camera,
+ FLAGS_num_parameters_per_landmark,
+ FLAGS_num_residuals_per_camera);
+ Vector x(jacobian->num_cols());
+ Vector y(jacobian->num_rows());
+ x.setRandom();
+ y.setRandom();
+ double sum = 0;
+ for (auto _ : state) {
+ // This code gets timed
+ jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
+ sum += y.norm();
+ }
+ CHECK_NE(sum, 0.0);
+}
+
+static void BM_CpuLeftMultiplyAndAccumulate(benchmark::State& state) {
+ // Perform setup here
+ std::unique_ptr<BlockSparseMatrix> jacobian =
+ GenerateSyntheticJacobian(FLAGS_num_cameras,
+ FLAGS_num_landmarks,
+ FLAGS_num_parameters_per_camera,
+ FLAGS_num_parameters_per_landmark,
+ FLAGS_num_residuals_per_camera);
+ Vector x(jacobian->num_rows());
+ Vector y(jacobian->num_cols());
+ x.setRandom();
+ y.setRandom();
+ double sum = 0;
+ for (auto _ : state) {
+ // This code gets timed
+ jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
+ sum += y.norm();
+ }
+ CHECK_NE(sum, 0.0);
+}
+
+static void BM_CudaRightMultiplyAndAccumulate(benchmark::State& state) {
+ // Perform setup here
+ std::unique_ptr<BlockSparseMatrix> jacobian =
+ GenerateSyntheticJacobian(FLAGS_num_cameras,
+ FLAGS_num_landmarks,
+ FLAGS_num_parameters_per_camera,
+ FLAGS_num_parameters_per_landmark,
+ FLAGS_num_residuals_per_camera);
+ ContextImpl context;
+ std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CompressedRowSparseMatrix jacobian_crs(
+ jacobian->num_rows(),
+ jacobian->num_cols(),
+ jacobian->num_nonzeros());
+ jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
+ CudaSparseMatrix cuda_jacobian(&context, jacobian_crs);
+ CudaVector cuda_x(&context, 0);
+ CudaVector cuda_y(&context, 0);
+
+ Vector x(jacobian->num_cols());
+ Vector y(jacobian->num_rows());
+ x.setRandom();
+ y.setRandom();
+
+ cuda_x.CopyFromCpu(x);
+ cuda_y.CopyFromCpu(y);
+ double sum = 0;
+ for (auto _ : state) {
+ // This code gets timed
+ cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
+ sum += cuda_y.Norm();
+ CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
+ }
+ CHECK_NE(sum, 0.0);
+}
+
+static void BM_CudaLeftMultiplyAndAccumulate(benchmark::State& state) {
+ // Perform setup here
+ std::unique_ptr<BlockSparseMatrix> jacobian =
+ GenerateSyntheticJacobian(FLAGS_num_cameras,
+ FLAGS_num_landmarks,
+ FLAGS_num_parameters_per_camera,
+ FLAGS_num_parameters_per_landmark,
+ FLAGS_num_residuals_per_camera);
+ ContextImpl context;
+ std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CompressedRowSparseMatrix jacobian_crs(
+ jacobian->num_rows(),
+ jacobian->num_cols(),
+ jacobian->num_nonzeros());
+ jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
+ CudaSparseMatrix cuda_jacobian(&context, jacobian_crs);
+ CudaVector cuda_x(&context, 0);
+ CudaVector cuda_y(&context, 0);
+
+ Vector x(jacobian->num_rows());
+ Vector y(jacobian->num_cols());
+ x.setRandom();
+ y.setRandom();
+
+ cuda_x.CopyFromCpu(x);
+ cuda_y.CopyFromCpu(y);
+ double sum = 0;
+ for (auto _ : state) {
+ // This code gets timed
+ cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
+ sum += cuda_y.Norm();
+ CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
+ }
+ CHECK_NE(sum, 0.0);
+}
+
+BENCHMARK(BM_CpuRightMultiplyAndAccumulate);
+BENCHMARK(BM_CpuLeftMultiplyAndAccumulate);
+BENCHMARK(BM_CudaRightMultiplyAndAccumulate);
+BENCHMARK(BM_CudaLeftMultiplyAndAccumulate);
+
+
+
+} // namespace ceres::internal
+
+BENCHMARK_MAIN();