Update CudaSparseMatrix class

- Perform temporary buffer size estimation only once
- Allow construction from existing buffers with col/row structure

Change-Id: I73c291328f1e8ed9184aba5d7058df71cbc6a15d
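In sketch form, the second bullet changes construction from "allocate, then fill in place" into "fill standalone buffers, then move them in". The snippet below is illustrative only: argument lists are elided, and FillCRSStructure stands in for whichever kernel populates the CRS index arrays at the call sites shown in the diffs that follow.

    // Before: the matrix allocated rows/cols itself and was left in an
    // inconsistent state until the caller filled them in place.
    auto m = std::make_unique<CudaSparseMatrix>(num_rows, num_cols, nnz, context);
    FillCRSStructure(..., m->mutable_rows(), m->mutable_cols(), ...);

    // After: the caller fills standalone index buffers and moves them in; the
    // matrix is fully initialized (including the one-time temporary buffer
    // size estimation of the first bullet) once the constructor returns.
    CudaBuffer<int32_t> rows(context, num_rows + 1);
    CudaBuffer<int32_t> cols(context, nnz);
    FillCRSStructure(..., rows.data(), cols.data(), ...);
    auto m = std::make_unique<CudaSparseMatrix>(
        num_cols, std::move(rows), std::move(cols), context);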
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc index 6d4c6b0..f0da35f 100644 --- a/internal/ceres/cuda_block_sparse_crs_view.cc +++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -41,16 +41,16 @@ : context_(context) { block_structure_ = std::make_unique<CudaBlockSparseStructure>( *bsm.block_structure(), context); - crs_matrix_ = std::make_unique<CudaSparseMatrix>( - bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context); + CudaBuffer<int32_t> rows(context, bsm.num_rows() + 1); + CudaBuffer<int32_t> cols(context, bsm.num_nonzeros()); FillCRSStructure(block_structure_->num_row_blocks(), bsm.num_rows(), block_structure_->first_cell_in_row_block(), block_structure_->cells(), block_structure_->row_blocks(), block_structure_->col_blocks(), - crs_matrix_->mutable_rows(), - crs_matrix_->mutable_cols(), + rows.data(), + cols.data(), context->DefaultStream()); is_crs_compatible_ = block_structure_->IsCrsCompatible(); // if matrix is crs-compatible - we can drop block-structure and don't need @@ -63,6 +63,8 @@ streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>( context_, kMaxTemporaryArraySize); } + crs_matrix_ = std::make_unique<CudaSparseMatrix>( + bsm.num_cols(), std::move(rows), std::move(cols), context); UpdateValues(bsm); }
diff --git a/internal/ceres/cuda_block_sparse_crs_view.h b/internal/ceres/cuda_block_sparse_crs_view.h index 2ae8721..58ef618 100644 --- a/internal/ceres/cuda_block_sparse_crs_view.h +++ b/internal/ceres/cuda_block_sparse_crs_view.h
@@ -80,6 +80,14 @@ // Returns true if block-sparse matrix had CRS-compatible value layout bool IsCrsCompatible() const { return is_crs_compatible_; } + void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const { + crs_matrix()->LeftMultiplyAndAccumulate(x, y); + } + + void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const { + crs_matrix()->RightMultiplyAndAccumulate(x, y); + } + private: // Value permutation kernel performs a single element-wise operation per // thread, thus performing permutation in blocks of 8 megabytes of
diff --git a/internal/ceres/cuda_block_sparse_crs_view_test.cc b/internal/ceres/cuda_block_sparse_crs_view_test.cc index dd49ee8..07e8513 100644 --- a/internal/ceres/cuda_block_sparse_crs_view_test.cc +++ b/internal/ceres/cuda_block_sparse_crs_view_test.cc
@@ -99,7 +99,7 @@ 1); } - void Compare(const BlockSparseMatrix& bsm, CudaSparseMatrix& csm) { + void Compare(const BlockSparseMatrix& bsm, const CudaSparseMatrix& csm) { ASSERT_EQ(csm.num_cols(), bsm.num_cols()); ASSERT_EQ(csm.num_rows(), bsm.num_rows()); ASSERT_EQ(csm.num_nonzeros(), bsm.num_nonzeros()); @@ -138,7 +138,7 @@ CudaBlockSparseCRSView(*block_sparse_non_crs_compatible_, &context_); ASSERT_EQ(view.IsCrsCompatible(), false); - auto matrix = view.mutable_crs_matrix(); + auto matrix = view.crs_matrix(); Compare(*block_sparse_non_crs_compatible_, *matrix); } @@ -147,7 +147,7 @@ CudaBlockSparseCRSView(*block_sparse_crs_compatible_rows_, &context_); ASSERT_EQ(view.IsCrsCompatible(), true); - auto matrix = view.mutable_crs_matrix(); + auto matrix = view.crs_matrix(); Compare(*block_sparse_crs_compatible_rows_, *matrix); } @@ -156,7 +156,7 @@ &context_); ASSERT_EQ(view.IsCrsCompatible(), true); - auto matrix = view.mutable_crs_matrix(); + auto matrix = view.crs_matrix(); Compare(*block_sparse_crs_compatible_single_cell_, *matrix); } } // namespace ceres::internal
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h index d6abb15..98cab5d 100644 --- a/internal/ceres/cuda_buffer.h +++ b/internal/ceres/cuda_buffer.h
@@ -55,6 +55,13 @@ CudaBuffer(ContextImpl* context, int size) : context_(context) { Reserve(size); } + + CudaBuffer(CudaBuffer&& other) + : data_(other.data_), size_(other.size_), context_(other.context_) { + other.data_ = nullptr; + other.size_ = 0; + } + CudaBuffer(const CudaBuffer&) = delete; CudaBuffer& operator=(const CudaBuffer&) = delete; @@ -162,4 +169,4 @@ #endif // CERES_NO_CUDA -#endif // CERES_INTERNAL_CUDA_BUFFER_H_ \ No newline at end of file +#endif // CERES_INTERNAL_CUDA_BUFFER_H_
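The new CudaBuffer move constructor is what makes the construct-from-existing-buffers API possible. Below is a minimal, self-contained sketch of the ownership rule it follows, using plain host memory instead of device memory purely for illustration (this is not the Ceres class): the moved-from object must give up its pointer so that its destructor does not release storage now owned by the destination.

    #include <cstdlib>

    class OwningBuffer {
     public:
      explicit OwningBuffer(int size)
          : data_(static_cast<double*>(std::malloc(size * sizeof(double)))),
            size_(size) {}
      // Move: steal the allocation and leave the source empty.
      OwningBuffer(OwningBuffer&& other) noexcept
          : data_(other.data_), size_(other.size_) {
        other.data_ = nullptr;  // the moved-from destructor becomes a no-op
        other.size_ = 0;
      }
      OwningBuffer(const OwningBuffer&) = delete;
      OwningBuffer& operator=(const OwningBuffer&) = delete;
      ~OwningBuffer() { std::free(data_); }  // free(nullptr) is well defined

     private:
      double* data_;
      int size_;
    };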
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc index 703be84..9153544 100644 --- a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc +++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
@@ -59,10 +59,11 @@ ? bs.cols[num_col_blocks_e].position : bsm.num_cols(); const int num_cols_f = bsm.num_cols() - num_cols_e; - matrix_e_ = std::make_unique<CudaSparseMatrix>( - num_rows, num_cols_e, num_nonzeros_e, context); - matrix_f_ = std::make_unique<CudaSparseMatrix>( - num_rows, num_cols_f, num_nonzeros_f, context); + + CudaBuffer<int32_t> rows_e(context, num_rows + 1); + CudaBuffer<int32_t> cols_e(context, num_nonzeros_e); + CudaBuffer<int32_t> rows_f(context, num_rows + 1); + CudaBuffer<int32_t> cols_f(context, num_nonzeros_f); num_row_blocks_e_ = block_structure_->num_row_blocks_e(); FillCRSStructurePartitioned(block_structure_->num_row_blocks(), @@ -74,10 +75,10 @@ block_structure_->cells(), block_structure_->row_blocks(), block_structure_->col_blocks(), - matrix_e_->mutable_rows(), - matrix_e_->mutable_cols(), - matrix_f_->mutable_rows(), - matrix_f_->mutable_cols(), + rows_e.data(), + cols_e.data(), + rows_f.data(), + cols_f.data(), context->DefaultStream()); f_is_crs_compatible_ = block_structure_->IsCrsCompatible(); if (f_is_crs_compatible_) { @@ -86,6 +87,11 @@ streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>( context, kMaxTemporaryArraySize); } + matrix_e_ = std::make_unique<CudaSparseMatrix>( + num_cols_e, std::move(rows_e), std::move(cols_e), context); + matrix_f_ = std::make_unique<CudaSparseMatrix>( + num_cols_f, std::move(rows_f), std::move(cols_f), context); + CHECK_EQ(bsm.num_nonzeros(), matrix_e_->num_nonzeros() + matrix_f_->num_nonzeros());
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc index 5090b6a..ddfdeef 100644 --- a/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc +++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc
@@ -196,11 +196,8 @@ const int num_cols_e = bs.cols[num_col_blocks_e].position; const int num_cols_f = num_cols - num_cols_e; - // TODO: we definitely would like to use matrix() here, but - // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because - // it might allocate additional storage by request of cuSPARSE - auto matrix_e = view.mutable_matrix_e(); - auto matrix_f = view.mutable_matrix_f(); + auto matrix_e = view.matrix_e(); + auto matrix_f = view.matrix_f(); ASSERT_EQ(matrix_e->num_cols(), num_cols_e); ASSERT_EQ(matrix_e->num_rows(), num_rows); ASSERT_EQ(matrix_f->num_cols(), num_cols_f);
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc index 905b4ab..2d6f2f8 100644 --- a/internal/ceres/cuda_sparse_matrix.cc +++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -58,63 +58,85 @@ #include "cusparse.h" namespace ceres::internal { +namespace { +// Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of +// CUSPARSE_SPMV_ALG_DEFAULT. +#if CUDART_VERSION >= 11021 +const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT; +#else // CUDART_VERSION >= 11021 +const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT; +#endif // CUDART_VERSION >= 11021 +size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle, + const cusparseOperation_t op, + const cusparseDnVecDescr_t& x, + const cusparseDnVecDescr_t& y, + const cusparseSpMatDescr_t& A) { + size_t buffer_size; + const double alpha = 1.0; + const double beta = 1.0; + CHECK_NE(A, nullptr); + CHECK_EQ(cusparseSpMV_bufferSize(handle, + op, + &alpha, + A, + x, + &beta, + y, + CUDA_R_64F, + kSpMVAlgorithm, + &buffer_size), + CUSPARSE_STATUS_SUCCESS); + return buffer_size; +} -CudaSparseMatrix::CudaSparseMatrix(int num_rows, - int num_cols, - int num_nonzeros, +size_t GetTempBufferSize(const cusparseHandle_t& handle, + const cusparseDnVecDescr_t& left, + const cusparseDnVecDescr_t& right, + const cusparseSpMatDescr_t& A) { + CHECK_NE(A, nullptr); + return std::max(GetTempBufferSizeForOp( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A), + GetTempBufferSizeForOp( + handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A)); +} +} // namespace + +CudaSparseMatrix::CudaSparseMatrix(int num_cols, + CudaBuffer<int32_t>&& rows, + CudaBuffer<int32_t>&& cols, ContextImpl* context) - : num_rows_(num_rows), + : num_rows_(rows.size() - 1), num_cols_(num_cols), - num_nonzeros_(num_nonzeros), + num_nonzeros_(cols.size()), context_(context), - rows_(context, num_rows + 1), - cols_(context, num_nonzeros), - values_(context, num_nonzeros), + rows_(std::move(rows)), + cols_(std::move(cols)), + values_(context, num_nonzeros_), spmv_buffer_(context) { - cusparseCreateCsr(&descr_, - num_rows_, - num_cols_, - num_nonzeros_, - rows_.data(), - cols_.data(), - values_.data(), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F); + Initialize(); } CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) - : context_(context), - rows_{context}, - cols_{context}, - values_{context}, - spmv_buffer_{context} { - DCHECK_NE(context, nullptr); - CHECK(context->IsCudaInitialized()); - num_rows_ = crs_matrix.num_rows(); - num_cols_ = crs_matrix.num_cols(); - num_nonzeros_ = crs_matrix.num_nonzeros(); + : num_rows_(crs_matrix.num_rows()), + num_cols_(crs_matrix.num_cols()), + num_nonzeros_(crs_matrix.num_nonzeros()), + context_(context), + rows_(context, num_rows_ + 1), + cols_(context, num_nonzeros_), + values_(context, num_nonzeros_), + spmv_buffer_(context) { rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1); cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_); values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); - cusparseCreateCsr(&descr_, - num_rows_, - num_cols_, - num_nonzeros_, - rows_.data(), - cols_.data(), - values_.data(), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F); + Initialize(); } CudaSparseMatrix::~CudaSparseMatrix() { CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS); descr_ = nullptr; + CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_)); + CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_)); } void CudaSparseMatrix::CopyValuesFromCpu( @@ -128,58 +150,76 @@ values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); } +void 
CudaSparseMatrix::Initialize() { + CHECK(context_->IsCudaInitialized()); + CHECK_EQ(CUSPARSE_STATUS_SUCCESS, + cusparseCreateCsr(&descr_, + num_rows_, + num_cols_, + num_nonzeros_, + rows_.data(), + cols_.data(), + values_.data(), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); + + // Note: values_.data() is used as non-zero pointer to device memory + // When there is no non-zero values, data-pointer of values_ array will be a + // nullptr; but in this case left/right products are trivial and temporary + // buffer (and vector descriptors) is not required + if (!num_nonzeros_) return; + + CHECK_EQ(CUSPARSE_STATUS_SUCCESS, + cusparseCreateDnVec( + &descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F)); + CHECK_EQ(CUSPARSE_STATUS_SUCCESS, + cusparseCreateDnVec( + &descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F)); + size_t buffer_size = GetTempBufferSize( + context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_); + spmv_buffer_.Reserve(buffer_size); +} + void CudaSparseMatrix::SpMv(cusparseOperation_t op, - const CudaVector& x, - CudaVector* y) { + const cusparseDnVecDescr_t& x, + const cusparseDnVecDescr_t& y) const { size_t buffer_size = 0; const double alpha = 1.0; const double beta = 1.0; - // Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of - // CUSPARSE_SPMV_ALG_DEFAULT. -#if CUDART_VERSION >= 11021 - const auto algorithm = CUSPARSE_SPMV_ALG_DEFAULT; -#else // CUDART_VERSION >= 11021 - const auto algorithm = CUSPARSE_MV_ALG_DEFAULT; -#endif // CUDART_VERSION >= 11021 - - CHECK_EQ(cusparseSpMV_bufferSize(context_->cusparse_handle_, - op, - &alpha, - descr_, - x.descr(), - &beta, - y->descr(), - CUDA_R_64F, - algorithm, - &buffer_size), - CUSPARSE_STATUS_SUCCESS); - spmv_buffer_.Reserve(buffer_size); CHECK_EQ(cusparseSpMV(context_->cusparse_handle_, op, &alpha, descr_, - x.descr(), + x, &beta, - y->descr(), + y, CUDA_R_64F, - algorithm, + kSpMVAlgorithm, spmv_buffer_.data()), CUSPARSE_STATUS_SUCCESS); } void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, - CudaVector* y) { - SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x, y); + CudaVector* y) const { + DCHECK(GetTempBufferSize( + context_->cusparse_handle_, y->descr(), x.descr(), descr_) <= + spmv_buffer_.size()); + SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr()); } void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, - CudaVector* y) { + CudaVector* y) const { // TODO(Joydeep Biswas): We should consider storing a transposed copy of the // matrix by converting CSR to CSC. From the cuSPARSE documentation: // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA // != CUSPARSE_OPERATION_NON_TRANSPOSE" - SpMv(CUSPARSE_OPERATION_TRANSPOSE, x, y); + DCHECK(GetTempBufferSize( + context_->cusparse_handle_, x.descr(), y->descr(), descr_) <= + spmv_buffer_.size()); + SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr()); } } // namespace ceres::internal
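Condensed, the cuda_sparse_matrix.cc change moves the cuSPARSE workspace query from every product call into the constructor path. The identifiers below are the ones introduced in the diff above; this is a summary of that diff, not additional code.

    // Before, on every RightMultiplyAndAccumulate / LeftMultiplyAndAccumulate:
    //   cusparseSpMV_bufferSize(handle, op, &alpha, descr_, x.descr(), &beta,
    //                           y->descr(), CUDA_R_64F, algorithm, &buffer_size);
    //   spmv_buffer_.Reserve(buffer_size);
    //   cusparseSpMV(...);
    //
    // After, once in Initialize(), using the two dense-vector descriptors that
    // alias values_.data():
    //   spmv_buffer_.Reserve(std::max(
    //       GetTempBufferSizeForOp(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
    //                              descr_vec_right_, descr_vec_left_, descr_),
    //       GetTempBufferSizeForOp(handle, CUSPARSE_OPERATION_TRANSPOSE,
    //                              descr_vec_left_, descr_vec_right_, descr_)));
    //
    // With the worst case of both operations reserved up front, the product
    // methods can be const and only DCHECK that the buffer is still large enough.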
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h index f5fcb91..d53afd1 100644 --- a/internal/ceres/cuda_sparse_matrix.h +++ b/internal/ceres/cuda_sparse_matrix.h
@@ -56,27 +56,28 @@ // A sparse matrix hosted on the GPU in compressed row sparse format, with // CUDA-accelerated operations. +// The user of the class must ensure that ContextImpl::InitCuda() has already +// been successfully called before using this class. class CERES_NO_EXPORT CudaSparseMatrix { public: - // Create a GPU copy of the matrix provided. The caller must ensure that - // InitCuda() has already been successfully called on context before calling - // this constructor. + // Create a GPU copy of the matrix provided. CudaSparseMatrix(ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix); - // Creates a "blank" matrix with an appropriate amount of memory allocated. - // The object itself is left in an inconsistent state. - CudaSparseMatrix(int num_rows, - int num_cols, - int num_nonzeros, + // Create matrix from existing row and column index buffers. + // Values are left uninitialized. + CudaSparseMatrix(int num_cols, + CudaBuffer<int32_t>&& rows, + CudaBuffer<int32_t>&& cols, ContextImpl* context); ~CudaSparseMatrix(); + // Left/right products are using internal buffer and are not thread-safe // y = y + Ax; - void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y); + void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const; // y = y + A'x; - void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y); + void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const; int num_rows() const { return num_rows_; } int num_cols() const { return num_cols_; } @@ -104,9 +105,15 @@ CudaSparseMatrix(const CudaSparseMatrix&) = delete; CudaSparseMatrix& operator=(const CudaSparseMatrix&) = delete; + // Allocate temporary buffer for left/right products, create cuSPARSE + // descriptors + void Initialize(); + // y = y + op(M)x. op must be either CUSPARSE_OPERATION_NON_TRANSPOSE or // CUSPARSE_OPERATION_TRANSPOSE. - void SpMv(cusparseOperation_t op, const CudaVector& x, CudaVector* y); + void SpMv(cusparseOperation_t op, + const cusparseDnVecDescr_t& x, + const cusparseDnVecDescr_t& y) const; int num_rows_ = 0; int num_cols_ = 0; @@ -123,7 +130,11 @@ // CuSparse object that describes this matrix. cusparseSpMatDescr_t descr_ = nullptr; - CudaBuffer<uint8_t> spmv_buffer_; + // Dense vector descriptors for pointer interface + cusparseDnVecDescr_t descr_vec_left_ = nullptr; + cusparseDnVecDescr_t descr_vec_right_ = nullptr; + + mutable CudaBuffer<uint8_t> spmv_buffer_; }; } // namespace ceres::internal
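The header makes the trade-off explicit: the products are const for callers, but they still scratch into spmv_buffer_, hence the mutable qualifier and the "not thread-safe" comment. If one matrix is shared across threads, the caller has to serialize the products; a hedged sketch follows (the mutex is caller-owned and assumed here, it is not part of the class):

    std::mutex spmv_mutex;  // caller-owned, not provided by CudaSparseMatrix
    {
      std::lock_guard<std::mutex> lock(spmv_mutex);
      matrix.RightMultiplyAndAccumulate(x, &y);  // const, but writes spmv_buffer_
    }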
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc index d434f36..b714aaf 100644 --- a/internal/ceres/cuda_vector.cc +++ b/internal/ceres/cuda_vector.cc
@@ -57,6 +57,15 @@ Resize(size); } +CudaVector::CudaVector(CudaVector&& other) + : num_rows_(other.num_rows_), + context_(other.context_), + data_(std::move(other.data_)), + descr_(other.descr_) { + other.num_rows_ = 0; + other.descr_ = nullptr; +} + CudaVector& CudaVector::operator=(const CudaVector& other) { if (this != &other) { Resize(other.num_rows()); @@ -88,7 +97,7 @@ num_rows_, data_.data(), 1, - x.data().data(), + x.data(), 1, &result), CUBLAS_STATUS_SUCCESS) @@ -105,13 +114,15 @@ return result; } +void CudaVector::CopyFromCpu(const double* x) { + data_.CopyFromCpu(x, num_rows_); +} + void CudaVector::CopyFromCpu(const Vector& x) { - data_.Reserve(x.rows()); - data_.CopyFromCpu(x.data(), x.rows()); - num_rows_ = x.rows(); - DestroyDescriptor(); - CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F), - CUSPARSE_STATUS_SUCCESS); + if (x.rows() != num_rows_) { + Resize(x.rows()); + } + CopyFromCpu(x.data()); } void CudaVector::CopyTo(Vector* x) const { @@ -126,6 +137,8 @@ } void CudaVector::SetZero() { + // Allow empty vector to be zeroed + if (num_rows_ == 0) return; CHECK(data_.data() != nullptr); CudaSetZeroFP64(data_.data(), num_rows_, context_->DefaultStream()); } @@ -147,7 +160,7 @@ CHECK_EQ(cublasDaxpy(context_->cublas_handle_, num_rows_, &a, - x.data().data(), + x.data(), 1, data_.data(), 1), @@ -156,11 +169,8 @@ } void CudaVector::DtDxpy(const CudaVector& D, const CudaVector& x) { - CudaDtDxpy(data_.data(), - D.data().data(), - x.data().data(), - num_rows_, - context_->DefaultStream()); + CudaDtDxpy( + data_.data(), D.data(), x.data(), num_rows_, context_->DefaultStream()); } void CudaVector::Scale(double s) {
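A short usage sketch of the reworked CopyFromCpu path (illustrative; the context and the Eigen-backed Vector are assumed to be set up as in the tests):

    CudaVector v(&context, 0);
    Vector x = Vector::Random(1000);
    v.CopyFromCpu(x);         // size differs -> Resize(1000), then device copy
    v.CopyFromCpu(x);         // same size -> copy in place, no reallocation and
                              // no re-creation of the cuSPARSE vector descriptor
    v.CopyFromCpu(x.data());  // new raw-pointer overload: the caller guarantees
                              // the array holds v.num_rows() entries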
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h index 46661cf..cbf3b27 100644 --- a/internal/ceres/cuda_vector.h +++ b/internal/ceres/cuda_vector.h
@@ -65,6 +65,8 @@ // context before calling this method. CudaVector(ContextImpl* context, int size); + CudaVector(CudaVector&& other); + ~CudaVector(); void Resize(int size); @@ -84,6 +86,9 @@ // Copy from Eigen vector. void CopyFromCpu(const Vector& x); + // Copy from CPU memory array. + void CopyFromCpu(const double* x); + // Copy to Eigen vector. void CopyTo(Vector* x) const; @@ -103,7 +108,8 @@ int num_rows() const { return num_rows_; } int num_cols() const { return 1; } - const CudaBuffer<double>& data() const { return data_; } + const double* data() const { return data_.data(); } + double* mutable_data() { return data_.data(); } const cusparseDnVecDescr_t& descr() const { return descr_; }
diff --git a/internal/ceres/cuda_vector_test.cc b/internal/ceres/cuda_vector_test.cc index db1fec5..fd7c166 100644 --- a/internal/ceres/cuda_vector_test.cc +++ b/internal/ceres/cuda_vector_test.cc
@@ -48,7 +48,7 @@ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message; CudaVector x(&context, 1000); EXPECT_EQ(x.num_rows(), 1000); - EXPECT_NE(x.data().data(), nullptr); + EXPECT_NE(x.data(), nullptr); } TEST(CudaVector, CopyVector) { @@ -67,6 +67,23 @@ EXPECT_EQ(x, z); } +TEST(CudaVector, Move) { + ContextImpl context; + std::string message; + CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message; + CudaVector y(&context, 10); + const auto y_data = y.data(); + const auto y_descr = y.descr(); + EXPECT_EQ(y.num_rows(), 10); + CudaVector z(std::move(y)); + EXPECT_EQ(y.data(), nullptr); + EXPECT_EQ(y.descr(), nullptr); + EXPECT_EQ(y.num_rows(), 0); + + EXPECT_EQ(z.data(), y_data); + EXPECT_EQ(z.descr(), y_descr); +} + TEST(CudaVector, DeepCopy) { Vector x(3); x << 1, 2, 3;
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc index 28a8aff..46ebe13 100644 --- a/internal/ceres/evaluation_benchmark.cc +++ b/internal/ceres/evaluation_benchmark.cc
@@ -496,7 +496,7 @@ cuda_x.CopyFromCpu(x); cuda_y.SetZero(); - auto matrix = view.mutable_matrix_f(); + auto matrix = view.matrix_f(); for (auto _ : state) { matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y); } @@ -522,7 +522,7 @@ cuda_x.CopyFromCpu(x); cuda_y.SetZero(); - auto matrix = view.mutable_matrix_f(); + auto matrix = view.matrix_f(); for (auto _ : state) { matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y); } @@ -548,7 +548,7 @@ cuda_x.CopyFromCpu(x); cuda_y.SetZero(); - auto matrix = view.mutable_matrix_e(); + auto matrix = view.matrix_e(); for (auto _ : state) { matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y); } @@ -574,7 +574,7 @@ cuda_x.CopyFromCpu(x); cuda_y.SetZero(); - auto matrix = view.mutable_matrix_e(); + auto matrix = view.matrix_e(); for (auto _ : state) { matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y); }