CUDA CGNR, Part 3: CudaSparseMatrix

* Added CudaSparseMatrix to manage and operate on sparse matrices with
  cuSPARSE; a minimal usage sketch is included below.
* Added tests for CudaSparseMatrix.
* Added a new sparse linear operator benchmark.

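A minimal usage sketch of the new API (crs_matrix here stands for an
existing CompressedRowSparseMatrix; error handling elided):

  ContextImpl context;
  std::string message;
  CHECK(context.InitCUDA(&message)) << message;
  CudaSparseMatrix A_gpu(&context, crs_matrix);  // Uploads structure + values.
  CudaVector x(&context, A_gpu.num_cols());
  CudaVector y(&context, A_gpu.num_rows());
  x.SetZero();
  y.SetZero();
  A_gpu.RightMultiplyAndAccumulate(x, &y);       // y += A * x.
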
Change-Id: Id09df46de3b40be1f14441528088b54dab5844af
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 02916b1..dae16b5 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -198,6 +198,7 @@
     cost_function.cc
     covariance.cc
     covariance_impl.cc
+    cuda_sparse_matrix.cc
     cuda_vector.cc
     dense_cholesky.cc
     dense_normal_cholesky_solver.cc
@@ -484,6 +485,7 @@
   ceres_test(cuda_dense_cholesky)
   ceres_test(cuda_dense_qr)
   ceres_test(cuda_kernels)
+  ceres_test(cuda_sparse_matrix)
   ceres_test(cuda_vector)
   ceres_test(dense_linear_solver)
   ceres_test(dense_cholesky)
@@ -589,5 +591,9 @@
   add_executable(dense_linear_solver_benchmark dense_linear_solver_benchmark.cc)
   add_dependencies_to_benchmark(dense_linear_solver_benchmark)
 
+  add_executable(sparse_linear_operator_benchmark
+    sparse_linear_operator_benchmark.cc)
+  add_dependencies_to_benchmark(sparse_linear_operator_benchmark)
+
   add_subdirectory(autodiff_benchmarks)
 endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
new file mode 100644
index 0000000..2f857e2
--- /dev/null
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -0,0 +1,152 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+//
+// A CUDA sparse matrix linear operator.
+
+// This include must come before any #ifndef check on Ceres compile options.
+// clang-format off
+#include "ceres/internal/config.h"
+// clang-format on
+
+#include "ceres/cuda_sparse_matrix.h"
+
+#include <math.h>
+#include <memory>
+
+#include "ceres/internal/export.h"
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/compressed_row_sparse_matrix.h"
+#include "ceres/crs_matrix.h"
+#include "ceres/types.h"
+#include "ceres/context_impl.h"
+#include "ceres/wall_time.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "ceres/cuda_buffer.h"
+#include "ceres/cuda_vector.h"
+#include "ceres/ceres_cuda_kernels.h"
+#include "cusparse.h"
+
+
+namespace ceres::internal {
+
+CudaSparseMatrix::CudaSparseMatrix(
+    ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) {
+  DCHECK_NE(context, nullptr);
+  CHECK(context->IsCUDAInitialized());
+  context_ = context;
+  num_rows_ = crs_matrix.num_rows();
+  num_cols_ = crs_matrix.num_cols();
+  num_nonzeros_ = crs_matrix.num_nonzeros();
+  rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1, context_->stream_);
+  cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_, context_->stream_);
+  values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_);
+  CHECK_EQ(cusparseCreateCsr(&descr_,
+                             num_rows_,
+                             num_cols_,
+                             num_nonzeros_,
+                             rows_.data(),
+                             cols_.data(),
+                             values_.data(),
+                             CUSPARSE_INDEX_32I,
+                             CUSPARSE_INDEX_32I,
+                             CUSPARSE_INDEX_BASE_ZERO,
+                             CUDA_R_64F),
+           CUSPARSE_STATUS_SUCCESS);
+}
+
+CudaSparseMatrix::~CudaSparseMatrix() {
+  CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS);
+  descr_ = nullptr;
+}
+
+void CudaSparseMatrix::CopyValuesFromCpu(
+    const CompressedRowSparseMatrix& crs_matrix) {
+  // There is no quick and easy way to verify that the structure is unchanged,
+  // but at least we can check that the size of the matrix and the number of
+  // nonzeros is unchanged.
+  CHECK_EQ(num_rows_, crs_matrix.num_rows());
+  CHECK_EQ(num_cols_, crs_matrix.num_cols());
+  CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
+  values_.CopyFromCpu(
+      crs_matrix.values(), num_nonzeros_, context_->stream_);
+}
+
+void CudaSparseMatrix::SpMv(cusparseOperation_t op,
+                            const CudaVector& x,
+                            CudaVector* y) {
+  size_t buffer_size = 0;
+  const double alpha = 1.0;
+  const double beta = 1.0;
+
+  CHECK_EQ(cusparseSpMV_bufferSize(context_->cusparse_handle_,
+                                   op,
+                                   &alpha,
+                                   descr_,
+                                   x.descr(),
+                                   &beta,
+                                   y->descr(),
+                                   CUDA_R_64F,
+                                   CUSPARSE_SPMV_ALG_DEFAULT,
+                                   &buffer_size),
+           CUSPARSE_STATUS_SUCCESS);
+  spmv_buffer_.Reserve(buffer_size);
+  CHECK_EQ(cusparseSpMV(context_->cusparse_handle_,
+                        op,
+                        &alpha,
+                        descr_,
+                        x.descr(),
+                        &beta,
+                        y->descr(),
+                        CUDA_R_64F,
+                        CUSPARSE_SPMV_ALG_DEFAULT,
+                        spmv_buffer_.data()),
+           CUSPARSE_STATUS_SUCCESS);
+}
+
+void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x,
+                                                  CudaVector* y) {
+  SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x, y);
+}
+
+void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x,
+                                                 CudaVector* y) {
+  // TODO(Joydeep Biswas): We should consider storing a transposed copy of the
+  // matrix by converting CSR to CSC. From the cuSPARSE documentation:
+  // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
+  // != CUSPARSE_OPERATION_NON_TRANSPOSE"
+  SpMv(CUSPARSE_OPERATION_TRANSPOSE, x, y);
+}
+
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h
new file mode 100644
index 0000000..7661fb9
--- /dev/null
+++ b/internal/ceres/cuda_sparse_matrix.h
@@ -0,0 +1,120 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+//
+// A CUDA sparse matrix linear operator.
+
+#ifndef CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_
+#define CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_
+
+// This include must come before any #ifndef check on Ceres compile options.
+// clang-format off
+#include "ceres/internal/config.h"
+// clang-format on
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "ceres/compressed_row_sparse_matrix.h"
+#include "ceres/internal/export.h"
+#include "ceres/types.h"
+#include "ceres/context_impl.h"
+
+#ifndef CERES_NO_CUDA
+#include "ceres/cuda_buffer.h"
+#include "ceres/cuda_vector.h"
+#include "cusparse.h"
+
+namespace ceres::internal {
+
+// A sparse matrix hosted on the GPU in compressed row sparse format, with
+// CUDA-accelerated operations.
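+//
+// A minimal usage sketch (assumes InitCUDA() has already succeeded on
+// context, and crs is an existing CompressedRowSparseMatrix):
+//   CudaSparseMatrix A_gpu(context, crs);
+//   CudaVector x(context, A_gpu.num_cols());
+//   CudaVector y(context, A_gpu.num_rows());
+//   A_gpu.RightMultiplyAndAccumulate(x, &y);  // y += A * x.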
+class CERES_NO_EXPORT CudaSparseMatrix {
+ public:
+  // Create a GPU copy of the matrix provided. The caller must ensure that
+  // InitCUDA() has already been successfully called on context before calling
+  // this constructor.
+  CudaSparseMatrix(ContextImpl* context,
+                   const CompressedRowSparseMatrix& crs_matrix);
+
+  ~CudaSparseMatrix();
+
+  // y = y + Ax;
+  void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y);
+  // y = y + A'x;
+  void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y);
+
+  int num_rows() const { return num_rows_; }
+  int num_cols() const { return num_cols_; }
+  int num_nonzeros() const { return num_nonzeros_; }
+
+  // If subsequent uses of this matrix involve only numerical changes and no
+  // structural changes, then this method can be used to copy the updated
+  // non-zero values -- the row and column index arrays are kept the same. It
+  // is the caller's responsibility to ensure that the sparsity structure of
+  // the matrix is unchanged.
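+  //
+  // Sketch of the intended update pattern (names illustrative):
+  //   CudaSparseMatrix A_gpu(context, crs);  // Structure + values uploaded.
+  //   // ... modify the values (not the structure) of crs on the CPU ...
+  //   A_gpu.CopyValuesFromCpu(crs);          // Re-uploads the values only.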
+  void CopyValuesFromCpu(const CompressedRowSparseMatrix& crs_matrix);
+
+  const cusparseSpMatDescr_t& descr() const { return descr_; }
+
+ private:
+  // Disable copy and assignment.
+  CudaSparseMatrix(const CudaSparseMatrix&) = delete;
+  CudaSparseMatrix& operator=(const CudaSparseMatrix&) = delete;
+
+  // y = y + op(M)x. op must be either CUSPARSE_OPERATION_NON_TRANSPOSE or
+  // CUSPARSE_OPERATION_TRANSPOSE.
+  void SpMv(cusparseOperation_t op, const CudaVector& x, CudaVector* y);
+
+  int num_rows_ = 0;
+  int num_cols_ = 0;
+  int num_nonzeros_ = 0;
+
+  // CSR row pointers: num_rows + 1 offsets into cols_ and values_.
+  CudaBuffer<int32_t> rows_;
+  // CSR column indices.
+  CudaBuffer<int32_t> cols_;
+  // CSR values.
+  CudaBuffer<double> values_;
+
+  ContextImpl* context_ = nullptr;
+
+  // cuSPARSE descriptor of this matrix.
+  cusparseSpMatDescr_t descr_ = nullptr;
+
+  // Scratch buffer used by cusparseSpMV, reused across calls to SpMv().
+  CudaBuffer<uint8_t> spmv_buffer_;
+};
+
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
+#endif  // CERES_INTERNAL_CUDA_SPARSE_MATRIX_H_
diff --git a/internal/ceres/cuda_sparse_matrix_test.cc b/internal/ceres/cuda_sparse_matrix_test.cc
new file mode 100644
index 0000000..3a01ff9
--- /dev/null
+++ b/internal/ceres/cuda_sparse_matrix_test.cc
@@ -0,0 +1,315 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <string>
+
+#include "ceres/casts.h"
+#include "ceres/internal/config.h"
+#include "ceres/internal/eigen.h"
+#include "ceres/linear_least_squares_problems.h"
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_vector.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/triplet_sparse_matrix.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace ceres::internal {
+
+#ifndef CERES_NO_CUDA
+
+class CudaSparseMatrixTest : public ::testing::Test {
+ protected:
+  void SetUp() final {
+    std::string message;
+    CHECK(context_.InitCUDA(&message))
+        << "InitCUDA() failed because: " << message;
+    std::unique_ptr<LinearLeastSquaresProblem> problem =
+        CreateLinearLeastSquaresProblemFromId(2);
+    CHECK(problem != nullptr);
+    A_.reset(down_cast<BlockSparseMatrix*>(problem->A.release()));
+    CHECK(A_ != nullptr);
+    CHECK(problem->b != nullptr);
+    CHECK(problem->x != nullptr);
+    b_.resize(A_->num_rows());
+    for (int i = 0; i < A_->num_rows(); ++i) {
+      b_[i] = problem->b[i];
+    }
+    x_.resize(A_->num_cols());
+    for (int i = 0; i < A_->num_cols(); ++i) {
+      x_[i] = problem->x[i];
+    }
+    CHECK_EQ(A_->num_rows(), b_.rows());
+    CHECK_EQ(A_->num_cols(), x_.rows());
+  }
+
+  std::unique_ptr<BlockSparseMatrix> A_;
+  Vector x_;
+  Vector b_;
+  ContextImpl context_;
+};
+
+TEST_F(CudaSparseMatrixTest, RightMultiplyAndAccumulate) {
+  CompressedRowSparseMatrix A_crs(
+      A_->num_rows(), A_->num_cols(), A_->num_nonzeros());
+  A_->ToCompressedRowSparseMatrix(&A_crs);
+  CudaSparseMatrix A_gpu(&context_, A_crs);
+  CudaVector x_gpu(&context_, A_gpu.num_cols());
+  CudaVector res_gpu(&context_, A_gpu.num_rows());
+  x_gpu.CopyFromCpu(x_);
+
+  const Vector minus_b = -b_;
+  // res = -b
+  res_gpu.CopyFromCpu(minus_b);
+  // res += A * x
+  A_gpu.RightMultiplyAndAccumulate(x_gpu, &res_gpu);
+
+  Vector res;
+  res_gpu.CopyTo(&res);
+
+  Vector res_expected = minus_b;
+  A_->RightMultiplyAndAccumulate(x_.data(), res_expected.data());
+
+  EXPECT_LE((res - res_expected).norm(),
+            std::numeric_limits<double>::epsilon() * 1e3);
+}
+
+TEST(CudaSparseMatrix, CopyValuesFromCpu) {
+  // A1:
+  // [ 1 1 0 0
+  //   0 1 1 0]
+  // A2:
+  // [ 1 2 0 0
+  //   0 3 4 0]
+  // b: [1 2 3 4]'
+  // A1 * b = [3 5]'
+  // A2 * b = [5 18]'
+  TripletSparseMatrix A1(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 1, 1, 1});
+  TripletSparseMatrix A2(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4});
+  Vector b(4);
+  b << 1, 2, 3, 4;
+
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  auto A1_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A1);
+  CudaSparseMatrix A_gpu(&context, *A1_crs);
+  CudaVector b_gpu(&context, A1.num_cols());
+  CudaVector x_gpu(&context, A1.num_rows());
+  b_gpu.CopyFromCpu(b);
+  x_gpu.SetZero();
+
+  Vector x_expected(2);
+  x_expected << 3, 5;
+  A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu);
+  Vector x_computed;
+  x_gpu.CopyTo(&x_computed);
+  EXPECT_EQ(x_computed, x_expected);
+
+  auto A2_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A2);
+  A_gpu.CopyValuesFromCpu(*A2_crs);
+  x_gpu.SetZero();
+  x_expected << 5, 18;
+  A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu);
+  x_gpu.CopyTo(&x_computed);
+  EXPECT_EQ(x_computed, x_expected);
+}
+
+TEST(CudaSparseMatrix, RightMultiplyAndAccumulate) {
+  // A:
+  // [ 1 2 0 0
+  //   0 3 4 0]
+  // b: [1 2 3 4]'
+  // A * b = [5 18]'
+  TripletSparseMatrix A(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4});
+  Vector b(4);
+  b << 1, 2, 3, 4;
+  Vector x_expected(2);
+  x_expected << 5, 18;
+
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
+  CudaSparseMatrix A_gpu(&context, *A_crs);
+  CudaVector b_gpu(&context, A.num_cols());
+  CudaVector x_gpu(&context, A.num_rows());
+  b_gpu.CopyFromCpu(b);
+  x_gpu.SetZero();
+
+  A_gpu.RightMultiplyAndAccumulate(b_gpu, &x_gpu);
+
+  Vector x_computed;
+  x_gpu.CopyTo(&x_computed);
+
+  EXPECT_EQ(x_computed, x_expected);
+}
+
+TEST(CudaSparseMatrix, LeftMultiplyAndAccumulate) {
+  // A:
+  // [ 1 2 0 0
+  //   0 3 4 0]
+  // b: [1 2]'
+  // A' * b = [1 8 8 0]'
+  TripletSparseMatrix A(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4});
+  Vector b(2);
+  b << 1, 2;
+  Vector x_expected(4);
+  x_expected << 1, 8, 8, 0;
+
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
+  CudaSparseMatrix A_gpu(&context, *A_crs);
+  CudaVector b_gpu(&context, A.num_rows());
+  CudaVector x_gpu(&context, A.num_cols());
+  b_gpu.CopyFromCpu(b);
+  x_gpu.SetZero();
+
+  A_gpu.LeftMultiplyAndAccumulate(b_gpu, &x_gpu);
+
+  Vector x_computed;
+  x_gpu.CopyTo(&x_computed);
+
+  EXPECT_EQ(x_computed, x_expected);
+}
+
+// If there are numerical errors due to synchronization issues, they will show
+// up when testing with large matrices, since each operation will take
+// significant time, thus hopefully revealing any potential synchronization
+// issues.
+TEST(CudaSparseMatrix, LargeMultiplyAndAccumulate) {
+  // Create a large NxN matrix A that has the following structure:
+  // In row i, only columns i and i+1 are non-zero.
+  // A_{i, i} = A_{i, i+1} = 1.
+  // There will be 2 * N - 1 non-zero elements in A.
+  // X = [1:N]
+  // Right multiply test:
+  // b = A * X
+  // Left multiply test:
+  // b = A' * X
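+  // For example, with N = 4:
+  //   A = [1 1 0 0
+  //        0 1 1 0
+  //        0 0 1 1
+  //        0 0 0 1]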
+
+  const int N = 10 * 1000 * 1000;
+  const int num_non_zeros = 2 * N - 1;
+  std::vector<int> row_indices(num_non_zeros);
+  std::vector<int> col_indices(num_non_zeros);
+  std::vector<double> values(num_non_zeros);
+
+  for (int i = 0; i < N; ++i) {
+    row_indices[2 * i] = i;
+    col_indices[2 * i] = i;
+    values[2 * i] = 1.0;
+    if (i + 1 < N) {
+      col_indices[2 * i + 1] = i + 1;
+      row_indices[2 * i + 1] = i;
+      values[2 * i + 1] = 1.0;
+    }
+  }
+  TripletSparseMatrix A(N, N, row_indices, col_indices, values);
+  Vector x(N);
+  for (int i = 0; i < N; ++i) {
+    x[i] = i + 1;
+  }
+
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
+  CudaSparseMatrix A_gpu(&context, *A_crs);
+  CudaVector b_gpu(&context, N);
+  CudaVector x_gpu(&context, N);
+  x_gpu.CopyFromCpu(x);
+
+
+  {
+    b_gpu.SetZero();
+    A_gpu.RightMultiplyAndAccumulate(x_gpu, &b_gpu);
+    Vector b_computed;
+    b_gpu.CopyTo(&b_computed);
+    for (int i = 0; i < N; ++i) {
+      if (i + 1 < N) {
+        EXPECT_EQ(b_computed[i], 2 * (i + 1) + 1);
+      } else {
+        EXPECT_EQ(b_computed[i], i + 1);
+      }
+    }
+  }
+
+  // Next check LeftMultiply.
+  {
+    b_gpu.SetZero();
+    A_gpu.LeftMultiplyAndAccumulate(x_gpu, &b_gpu);
+    Vector b_computed;
+    b_gpu.CopyTo(&b_computed);
+    for (int i = 0; i < N; ++i) {
+      if (i > 0) {
+        EXPECT_EQ(b_computed[i], 2 * (i + 1) - 1);
+      } else {
+        EXPECT_EQ(b_computed[i], i + 1);
+      }
+    }
+  }
+}
+
+#endif  // CERES_NO_CUDA
+
+}  // namespace ceres::internal
diff --git a/internal/ceres/sparse_linear_operator_benchmark.cc b/internal/ceres/sparse_linear_operator_benchmark.cc
new file mode 100644
index 0000000..c224e35
--- /dev/null
+++ b/internal/ceres/sparse_linear_operator_benchmark.cc
@@ -0,0 +1,239 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <iostream>
+#include <memory>
+#include <random>
+#include <string>
+
+#include "Eigen/Dense"
+#include "gflags/gflags.h"
+#include "benchmark/benchmark.h"
+#include "ceres/context_impl.h"
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_vector.h"
+#include "ceres/internal/config.h"
+#include "ceres/internal/eigen.h"
+#include "ceres/linear_solver.h"
+#include "cuda_runtime.h"
+
+namespace ceres::internal {
+
+// TODO(Joydeep Biswas): Add a matrix of benchmark sizes to test.
+
+namespace {
+// Generate a synthetic BA-style Jacobian with n camera poses, m landmarks, n_d
+// parameters per camera, m_d parameters per landmark, and k residuals per
+// camera.
+// TODO: Unify the synthetic Jacobian generation code with the code from
+// schur_eliminator_benchmark.cc since they are very similar.
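+// With the default flag values below (n = 1000, m = 10000, n_d = 6, m_d = 3,
+// k = 100), the generated Jacobian has 2 * k * n = 200,000 rows,
+// n * n_d + m * m_d = 36,000 columns, and 2 * k * n * (n_d + m_d) = 1,800,000
+// non-zeros.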
+std::unique_ptr<BlockSparseMatrix> GenerateSyntheticJacobian(
+    int n, int m, int n_d, int m_d, int k) {
+  static const int kResidualSize = 2;
+  // bs is owned by the BlockSparseMatrix constructed below.
+  CompressedRowBlockStructure* bs = new CompressedRowBlockStructure;
+  int c = 0;
+  // Add column blocks for each camera.
+  for (int i = 0; i < n; ++i) {
+    bs->cols.push_back(Block(n_d, c));
+    c += n_d;
+  }
+  // Add column blocks for each landmark.
+  for (int i = 0; i < m; ++i) {
+    bs->cols.push_back(Block(m_d, c));
+    c += m_d;
+  }
+  // Total number of row blocks = k * n.
+  bs->rows.resize(k * n);
+  int values_offset = 0;
+  std::mt19937 prng;
+  std::uniform_real_distribution uniform_0_m(0.0, static_cast<double>(m));
+  // Generate structure of the Jacobian.
+  // For n cameras:
+  for (int i = 0; i < n; ++i) {
+    const int camera_block_id = i;
+    // For k residuals per camera:
+    for (int j = 0; j < k; ++j) {
+      // Pick the landmark of the residual randomly from [0, m).
+      const int landmark_id = static_cast<int>(uniform_0_m(prng));
+      const int landmark_block_id = n + landmark_id;
+      const int row_idx = i * k + j;
+      const int row = kResidualSize * row_idx;
+      bs->rows[row_idx].block = Block(kResidualSize, row);
+      bs->rows[row_idx].cells.resize(2);
+      // The camera part of the jacobian of this residual.
+      bs->rows[row_idx].cells[0] = Cell(camera_block_id, values_offset);
+      values_offset += n_d * kResidualSize;
+      // The landmark part of the jacobian of this residual.
+      bs->rows[row_idx].cells[1] = Cell(landmark_block_id, values_offset);
+      values_offset += m_d * kResidualSize;
+    }
+  }
+  std::unique_ptr<BlockSparseMatrix> jacobian =
+      std::make_unique<BlockSparseMatrix>(bs);
+  VectorRef(jacobian->mutable_values(), jacobian->num_nonzeros()).setRandom();
+  return jacobian;
+}
+}  // namespace
+
+DEFINE_int32(num_cameras, 1000, "Number of cameras.");
+DEFINE_int32(num_landmarks, 10000, "Number of landmarks.");
+DEFINE_int32(num_parameters_per_camera, 6, "Number of parameters per camera.");
+DEFINE_int32(num_parameters_per_landmark,
+             3,
+             "Number of parameters per landmark.");
+DEFINE_int32(num_residuals_per_camera, 100, "Number of residuals per camera.");
+
+static void BM_CpuRightMultiplyAndAccumulate(benchmark::State& state) {
+  // Perform setup here
+  std::unique_ptr<BlockSparseMatrix> jacobian =
+      GenerateSyntheticJacobian(FLAGS_num_cameras,
+                                FLAGS_num_landmarks,
+                                FLAGS_num_parameters_per_camera,
+                                FLAGS_num_parameters_per_landmark,
+                                FLAGS_num_residuals_per_camera);
+  Vector x(jacobian->num_cols());
+  Vector y(jacobian->num_rows());
+  x.setRandom();
+  y.setRandom();
+  double sum = 0;
+  for (auto _ : state) {
+    // This code gets timed
+    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
+    sum += y.norm();
+  }
+  CHECK_NE(sum, 0.0);
+}
+
+static void BM_CpuLeftMultiplyAndAccumulate(benchmark::State& state) {
+  // Perform setup here
+  std::unique_ptr<BlockSparseMatrix> jacobian =
+      GenerateSyntheticJacobian(FLAGS_num_cameras,
+                                FLAGS_num_landmarks,
+                                FLAGS_num_parameters_per_camera,
+                                FLAGS_num_parameters_per_landmark,
+                                FLAGS_num_residuals_per_camera);
+  Vector x(jacobian->num_rows());
+  Vector y(jacobian->num_cols());
+  x.setRandom();
+  y.setRandom();
+  double sum = 0;
+  for (auto _ : state) {
+    // This code gets timed
+    jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
+    sum += y.norm();
+  }
+  CHECK_NE(sum, 0.0);
+}
+
+static void BM_CudaRightMultiplyAndAccumulate(benchmark::State& state) {
+  // Perform setup here
+  std::unique_ptr<BlockSparseMatrix> jacobian =
+      GenerateSyntheticJacobian(FLAGS_num_cameras,
+                                FLAGS_num_landmarks,
+                                FLAGS_num_parameters_per_camera,
+                                FLAGS_num_parameters_per_landmark,
+                                FLAGS_num_residuals_per_camera);
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CompressedRowSparseMatrix jacobian_crs(
+      jacobian->num_rows(),
+      jacobian->num_cols(),
+      jacobian->num_nonzeros());
+  jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
+  CudaSparseMatrix cuda_jacobian(&context, jacobian_crs);
+  CudaVector cuda_x(&context, 0);
+  CudaVector cuda_y(&context, 0);
+
+  Vector x(jacobian->num_cols());
+  Vector y(jacobian->num_rows());
+  x.setRandom();
+  y.setRandom();
+
+  cuda_x.CopyFromCpu(x);
+  cuda_y.CopyFromCpu(y);
+  double sum = 0;
+  for (auto _ : state) {
+    // This code gets timed
+    cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
+    sum += cuda_y.Norm();
+    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
+  }
+  CHECK_NE(sum, 0.0);
+}
+
+static void BM_CudaLeftMultiplyAndAccumulate(benchmark::State& state) {
+  // Perform setup here
+  std::unique_ptr<BlockSparseMatrix> jacobian =
+      GenerateSyntheticJacobian(FLAGS_num_cameras,
+                                FLAGS_num_landmarks,
+                                FLAGS_num_parameters_per_camera,
+                                FLAGS_num_parameters_per_landmark,
+                                FLAGS_num_residuals_per_camera);
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CompressedRowSparseMatrix jacobian_crs(
+      jacobian->num_rows(),
+      jacobian->num_cols(),
+      jacobian->num_nonzeros());
+  jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
+  CudaSparseMatrix cuda_jacobian(&context, jacobian_crs);
+  CudaVector cuda_x(&context, 0);
+  CudaVector cuda_y(&context, 0);
+
+  Vector x(jacobian->num_rows());
+  Vector y(jacobian->num_cols());
+  x.setRandom();
+  y.setRandom();
+
+  cuda_x.CopyFromCpu(x);
+  cuda_y.CopyFromCpu(y);
+  double sum = 0;
+  for (auto _ : state) {
+    // This code gets timed
+    cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
+    sum += cuda_y.Norm();
+    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
+  }
+  CHECK_NE(sum, 0.0);
+}
+
+BENCHMARK(BM_CpuRightMultiplyAndAccumulate);
+BENCHMARK(BM_CpuLeftMultiplyAndAccumulate);
+BENCHMARK(BM_CudaRightMultiplyAndAccumulate);
+BENCHMARK(BM_CudaLeftMultiplyAndAccumulate);
+
+}  // namespace ceres::internal
+
+BENCHMARK_MAIN();