Update CudaSparseMatrix class

- Perform temporary buffer size estimation only once
- Allow construction from existing buffers with col/row structure

Change-Id: I73c291328f1e8ed9184aba5d7058df71cbc6a15d
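In sketch form, the second bullet changes construction from "allocate, then fill in place" into "fill standalone buffers, then move them in". The snippet below is illustrative only: argument lists are elided, and FillCRSStructure stands in for whichever kernel populates the CRS index arrays at the call sites shown in the diffs that follow.

    // Before: the matrix allocated rows/cols itself and was left in an
    // inconsistent state until the caller filled them in place.
    auto m = std::make_unique<CudaSparseMatrix>(num_rows, num_cols, nnz, context);
    FillCRSStructure(..., m->mutable_rows(), m->mutable_cols(), ...);

    // After: the caller fills standalone index buffers and moves them in; the
    // matrix is fully initialized (including the one-time temporary buffer
    // size estimation of the first bullet) once the constructor returns.
    CudaBuffer<int32_t> rows(context, num_rows + 1);
    CudaBuffer<int32_t> cols(context, nnz);
    FillCRSStructure(..., rows.data(), cols.data(), ...);
    auto m = std::make_unique<CudaSparseMatrix>(
        num_cols, std::move(rows), std::move(cols), context);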
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc index 6d4c6b0..f0da35f 100644 --- a/internal/ceres/cuda_block_sparse_crs_view.cc +++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -41,16 +41,16 @@ : context_(context) { block_structure_ = std::make_unique<CudaBlockSparseStructure>( *bsm.block_structure(), context); - crs_matrix_ = std::make_unique<CudaSparseMatrix>( - bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context); + CudaBuffer<int32_t> rows(context, bsm.num_rows() + 1); + CudaBuffer<int32_t> cols(context, bsm.num_nonzeros()); FillCRSStructure(block_structure_->num_row_blocks(), bsm.num_rows(), block_structure_->first_cell_in_row_block(), block_structure_->cells(), block_structure_->row_blocks(), block_structure_->col_blocks(), - crs_matrix_->mutable_rows(), - crs_matrix_->mutable_cols(), + rows.data(), + cols.data(), context->DefaultStream()); is_crs_compatible_ = block_structure_->IsCrsCompatible(); // if matrix is crs-compatible - we can drop block-structure and don't need @@ -63,6 +63,8 @@ streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>( context_, kMaxTemporaryArraySize); } + crs_matrix_ = std::make_unique<CudaSparseMatrix>( + bsm.num_cols(), std::move(rows), std::move(cols), context); UpdateValues(bsm); }
diff --git a/internal/ceres/cuda_block_sparse_crs_view.h b/internal/ceres/cuda_block_sparse_crs_view.h index 2ae8721..58ef618 100644 --- a/internal/ceres/cuda_block_sparse_crs_view.h +++ b/internal/ceres/cuda_block_sparse_crs_view.h
@@ -80,6 +80,14 @@ // Returns true if block-sparse matrix had CRS-compatible value layout bool IsCrsCompatible() const { return is_crs_compatible_; } + void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const { + crs_matrix()->LeftMultiplyAndAccumulate(x, y); + } + + void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const { + crs_matrix()->RightMultiplyAndAccumulate(x, y); + } + private: // Value permutation kernel performs a single element-wise operation per // thread, thus performing permutation in blocks of 8 megabytes of
diff --git a/internal/ceres/cuda_block_sparse_crs_view_test.cc b/internal/ceres/cuda_block_sparse_crs_view_test.cc index dd49ee8..07e8513 100644 --- a/internal/ceres/cuda_block_sparse_crs_view_test.cc +++ b/internal/ceres/cuda_block_sparse_crs_view_test.cc
@@ -99,7 +99,7 @@ 1); } - void Compare(const BlockSparseMatrix& bsm, CudaSparseMatrix& csm) { + void Compare(const BlockSparseMatrix& bsm, const CudaSparseMatrix& csm) { ASSERT_EQ(csm.num_cols(), bsm.num_cols()); ASSERT_EQ(csm.num_rows(), bsm.num_rows()); ASSERT_EQ(csm.num_nonzeros(), bsm.num_nonzeros()); @@ -138,7 +138,7 @@ CudaBlockSparseCRSView(*block_sparse_non_crs_compatible_, &context_); ASSERT_EQ(view.IsCrsCompatible(), false); - auto matrix = view.mutable_crs_matrix(); + auto matrix = view.crs_matrix(); Compare(*block_sparse_non_crs_compatible_, *matrix); } @@ -147,7 +147,7 @@ CudaBlockSparseCRSView(*block_sparse_crs_compatible_rows_, &context_); ASSERT_EQ(view.IsCrsCompatible(), true); - auto matrix = view.mutable_crs_matrix(); + auto matrix = view.crs_matrix(); Compare(*block_sparse_crs_compatible_rows_, *matrix); } @@ -156,7 +156,7 @@ &context_); ASSERT_EQ(view.IsCrsCompatible(), true); - auto matrix = view.mutable_crs_matrix(); + auto matrix = view.crs_matrix(); Compare(*block_sparse_crs_compatible_single_cell_, *matrix); } } // namespace ceres::internal
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h index d6abb15..98cab5d 100644 --- a/internal/ceres/cuda_buffer.h +++ b/internal/ceres/cuda_buffer.h
@@ -55,6 +55,13 @@ CudaBuffer(ContextImpl* context, int size) : context_(context) { Reserve(size); } + + CudaBuffer(CudaBuffer&& other) + : data_(other.data_), size_(other.size_), context_(other.context_) { + other.data_ = nullptr; + other.size_ = 0; + } + CudaBuffer(const CudaBuffer&) = delete; CudaBuffer& operator=(const CudaBuffer&) = delete; @@ -162,4 +169,4 @@ #endif // CERES_NO_CUDA -#endif // CERES_INTERNAL_CUDA_BUFFER_H_ \ No newline at end of file +#endif // CERES_INTERNAL_CUDA_BUFFER_H_
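The new CudaBuffer move constructor is what makes the construct-from-existing-buffers API possible. Below is a minimal, self-contained sketch of the ownership rule it follows, using plain host memory instead of device memory purely for illustration (this is not the Ceres class): the moved-from object must give up its pointer so that its destructor does not release storage now owned by the destination.

    #include <cstdlib>

    class OwningBuffer {
     public:
      explicit OwningBuffer(int size)
          : data_(static_cast<double*>(std::malloc(size * sizeof(double)))),
            size_(size) {}
      // Move: steal the allocation and leave the source empty.
      OwningBuffer(OwningBuffer&& other) noexcept
          : data_(other.data_), size_(other.size_) {
        other.data_ = nullptr;  // the moved-from destructor becomes a no-op
        other.size_ = 0;
      }
      OwningBuffer(const OwningBuffer&) = delete;
      OwningBuffer& operator=(const OwningBuffer&) = delete;
      ~OwningBuffer() { std::free(data_); }  // free(nullptr) is well defined

     private:
      double* data_;
      int size_;
    };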
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc index 703be84..9153544 100644 --- a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc +++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
@@ -59,10 +59,11 @@ ? bs.cols[num_col_blocks_e].position : bsm.num_cols(); const int num_cols_f = bsm.num_cols() - num_cols_e; - matrix_e_ = std::make_unique<CudaSparseMatrix>( - num_rows, num_cols_e, num_nonzeros_e, context); - matrix_f_ = std::make_unique<CudaSparseMatrix>( - num_rows, num_cols_f, num_nonzeros_f, context); + + CudaBuffer<int32_t> rows_e(context, num_rows + 1); + CudaBuffer<int32_t> cols_e(context, num_nonzeros_e); + CudaBuffer<int32_t> rows_f(context, num_rows + 1); + CudaBuffer<int32_t> cols_f(context, num_nonzeros_f); num_row_blocks_e_ = block_structure_->num_row_blocks_e(); FillCRSStructurePartitioned(block_structure_->num_row_blocks(), @@ -74,10 +75,10 @@ block_structure_->cells(), block_structure_->row_blocks(), block_structure_->col_blocks(), - matrix_e_->mutable_rows(), - matrix_e_->mutable_cols(), - matrix_f_->mutable_rows(), - matrix_f_->mutable_cols(), + rows_e.data(), + cols_e.data(), + rows_f.data(), + cols_f.data(), context->DefaultStream()); f_is_crs_compatible_ = block_structure_->IsCrsCompatible(); if (f_is_crs_compatible_) { @@ -86,6 +87,11 @@ streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>( context, kMaxTemporaryArraySize); } + matrix_e_ = std::make_unique<CudaSparseMatrix>( + num_cols_e, std::move(rows_e), std::move(cols_e), context); + matrix_f_ = std::make_unique<CudaSparseMatrix>( + num_cols_f, std::move(rows_f), std::move(cols_f), context); + CHECK_EQ(bsm.num_nonzeros(), matrix_e_->num_nonzeros() + matrix_f_->num_nonzeros());
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc index 5090b6a..ddfdeef 100644 --- a/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc +++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc
@@ -196,11 +196,8 @@ const int num_cols_e = bs.cols[num_col_blocks_e].position; const int num_cols_f = num_cols - num_cols_e; - // TODO: we definitely would like to use matrix() here, but - // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because - // it might allocate additional storage by request of cuSPARSE - auto matrix_e = view.mutable_matrix_e(); - auto matrix_f = view.mutable_matrix_f(); + auto matrix_e = view.matrix_e(); + auto matrix_f = view.matrix_f(); ASSERT_EQ(matrix_e->num_cols(), num_cols_e); ASSERT_EQ(matrix_e->num_rows(), num_rows); ASSERT_EQ(matrix_f->num_cols(), num_cols_f);
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc index 905b4ab..2d6f2f8 100644 --- a/internal/ceres/cuda_sparse_matrix.cc +++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -58,63 +58,85 @@ #include "cusparse.h" namespace ceres::internal { +namespace { +// Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of +// CUSPARSE_SPMV_ALG_DEFAULT. +#if CUDART_VERSION >= 11021 +const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT; +#else // CUDART_VERSION >= 11021 +const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT; +#endif // CUDART_VERSION >= 11021 +size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle, + const cusparseOperation_t op, + const cusparseDnVecDescr_t& x, + const cusparseDnVecDescr_t& y, + const cusparseSpMatDescr_t& A) { + size_t buffer_size; + const double alpha = 1.0; + const double beta = 1.0; + CHECK_NE(A, nullptr); + CHECK_EQ(cusparseSpMV_bufferSize(handle, + op, + &alpha, + A, + x, + &beta, + y, + CUDA_R_64F, + kSpMVAlgorithm, + &buffer_size), + CUSPARSE_STATUS_SUCCESS); + return buffer_size; +} -CudaSparseMatrix::CudaSparseMatrix(int num_rows, - int num_cols, - int num_nonzeros, +size_t GetTempBufferSize(const cusparseHandle_t& handle, + const cusparseDnVecDescr_t& left, + const cusparseDnVecDescr_t& right, + const cusparseSpMatDescr_t& A) { + CHECK_NE(A, nullptr); + return std::max(GetTempBufferSizeForOp( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A), + GetTempBufferSizeForOp( + handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A)); +} +} // namespace + +CudaSparseMatrix::CudaSparseMatrix(int num_cols, + CudaBuffer<int32_t>&& rows, + CudaBuffer<int32_t>&& cols, ContextImpl* context) - : num_rows_(num_rows), + : num_rows_(rows.size() - 1), num_cols_(num_cols), - num_nonzeros_(num_nonzeros), + num_nonzeros_(cols.size()), context_(context), - rows_(context, num_rows + 1), - cols_(context, num_nonzeros), - values_(context, num_nonzeros), + rows_(std::move(rows)), + cols_(std::move(cols)), + values_(context, num_nonzeros_), spmv_buffer_(context) { - cusparseCreateCsr(&descr_, - num_rows_, - num_cols_, - num_nonzeros_, - rows_.data(), - cols_.data(), - values_.data(), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F); + Initialize(); } CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) - : context_(context), - rows_{context}, - cols_{context}, - values_{context}, - spmv_buffer_{context} { - DCHECK_NE(context, nullptr); - CHECK(context->IsCudaInitialized()); - num_rows_ = crs_matrix.num_rows(); - num_cols_ = crs_matrix.num_cols(); - num_nonzeros_ = crs_matrix.num_nonzeros(); + : num_rows_(crs_matrix.num_rows()), + num_cols_(crs_matrix.num_cols()), + num_nonzeros_(crs_matrix.num_nonzeros()), + context_(context), + rows_(context, num_rows_ + 1), + cols_(context, num_nonzeros_), + values_(context, num_nonzeros_), + spmv_buffer_(context) { rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1); cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_); values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); - cusparseCreateCsr(&descr_, - num_rows_, - num_cols_, - num_nonzeros_, - rows_.data(), - cols_.data(), - values_.data(), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F); + Initialize(); } CudaSparseMatrix::~CudaSparseMatrix() { CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS); descr_ = nullptr; + CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_)); + CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_)); } void CudaSparseMatrix::CopyValuesFromCpu( @@ -128,58 +150,76 @@ values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); } +void 
CudaSparseMatrix::Initialize() { + CHECK(context_->IsCudaInitialized()); + CHECK_EQ(CUSPARSE_STATUS_SUCCESS, + cusparseCreateCsr(&descr_, + num_rows_, + num_cols_, + num_nonzeros_, + rows_.data(), + cols_.data(), + values_.data(), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); + + // Note: values_.data() is used as non-zero pointer to device memory + // When there is no non-zero values, data-pointer of values_ array will be a + // nullptr; but in this case left/right products are trivial and temporary + // buffer (and vector descriptors) is not required + if (!num_nonzeros_) return; + + CHECK_EQ(CUSPARSE_STATUS_SUCCESS, + cusparseCreateDnVec( + &descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F)); + CHECK_EQ(CUSPARSE_STATUS_SUCCESS, + cusparseCreateDnVec( + &descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F)); + size_t buffer_size = GetTempBufferSize( + context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_); + spmv_buffer_.Reserve(buffer_size); +} + void CudaSparseMatrix::SpMv(cusparseOperation_t op, - const CudaVector& x, - CudaVector* y) { + const cusparseDnVecDescr_t& x, + const cusparseDnVecDescr_t& y) const { size_t buffer_size = 0; const double alpha = 1.0; const double beta = 1.0; - // Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of - // CUSPARSE_SPMV_ALG_DEFAULT. -#if CUDART_VERSION >= 11021 - const auto algorithm = CUSPARSE_SPMV_ALG_DEFAULT; -#else // CUDART_VERSION >= 11021 - const auto algorithm = CUSPARSE_MV_ALG_DEFAULT; -#endif // CUDART_VERSION >= 11021 - - CHECK_EQ(cusparseSpMV_bufferSize(context_->cusparse_handle_, - op, - &alpha, - descr_, - x.descr(), - &beta, - y->descr(), - CUDA_R_64F, - algorithm, - &buffer_size), - CUSPARSE_STATUS_SUCCESS); - spmv_buffer_.Reserve(buffer_size); CHECK_EQ(cusparseSpMV(context_->cusparse_handle_, op, &alpha, descr_, - x.descr(), + x, &beta, - y->descr(), + y, CUDA_R_64F, - algorithm, + kSpMVAlgorithm, spmv_buffer_.data()), CUSPARSE_STATUS_SUCCESS); } void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, - CudaVector* y) { - SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x, y); + CudaVector* y) const { + DCHECK(GetTempBufferSize( + context_->cusparse_handle_, y->descr(), x.descr(), descr_) <= + spmv_buffer_.size()); + SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr()); } void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, - CudaVector* y) { + CudaVector* y) const { // TODO(Joydeep Biswas): We should consider storing a transposed copy of the // matrix by converting CSR to CSC. From the cuSPARSE documentation: // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA // != CUSPARSE_OPERATION_NON_TRANSPOSE" - SpMv(CUSPARSE_OPERATION_TRANSPOSE, x, y); + DCHECK(GetTempBufferSize( + context_->cusparse_handle_, x.descr(), y->descr(), descr_) <= + spmv_buffer_.size()); + SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr()); } } // namespace ceres::internal
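Condensed, the cuda_sparse_matrix.cc change moves the cuSPARSE workspace query from every product call into the constructor path. The identifiers below are the ones introduced in the diff above; this is a summary of that diff, not additional code.

    // Before, on every RightMultiplyAndAccumulate / LeftMultiplyAndAccumulate:
    //   cusparseSpMV_bufferSize(handle, op, &alpha, descr_, x.descr(), &beta,
    //                           y->descr(), CUDA_R_64F, algorithm, &buffer_size);
    //   spmv_buffer_.Reserve(buffer_size);
    //   cusparseSpMV(...);
    //
    // After, once in Initialize(), using the two dense-vector descriptors that
    // alias values_.data():
    //   spmv_buffer_.Reserve(std::max(
    //       GetTempBufferSizeForOp(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
    //                              descr_vec_right_, descr_vec_left_, descr_),
    //       GetTempBufferSizeForOp(handle, CUSPARSE_OPERATION_TRANSPOSE,
    //                              descr_vec_left_, descr_vec_right_, descr_)));
    //
    // With the worst case of both operations reserved up front, the product
    // methods can be const and only DCHECK that the buffer is still large enough.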
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h index f5fcb91..d53afd1 100644 --- a/internal/ceres/cuda_sparse_matrix.h +++ b/internal/ceres/cuda_sparse_matrix.h
@@ -56,27 +56,28 @@ // A sparse matrix hosted on the GPU in compressed row sparse format, with // CUDA-accelerated operations. +// The user of the class must ensure that ContextImpl::InitCuda() has already +// been successfully called before using this class. class CERES_NO_EXPORT CudaSparseMatrix { public: - // Create a GPU copy of the matrix provided. The caller must ensure that - // InitCuda() has already been successfully called on context before calling - // this constructor. + // Create a GPU copy of the matrix provided. CudaSparseMatrix(ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix); - // Creates a "blank" matrix with an appropriate amount of memory allocated. - // The object itself is left in an inconsistent state. - CudaSparseMatrix(int num_rows, - int num_cols, - int num_nonzeros, + // Create matrix from existing row and column index buffers. + // Values are left uninitialized. + CudaSparseMatrix(int num_cols, + CudaBuffer<int32_t>&& rows, + CudaBuffer<int32_t>&& cols, ContextImpl* context); ~CudaSparseMatrix(); + // Left/right products are using internal buffer and are not thread-safe // y = y + Ax; - void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y); + void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const; // y = y + A'x; - void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y); + void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const; int num_rows() const { return num_rows_; } int num_cols() const { return num_cols_; } @@ -104,9 +105,15 @@ CudaSparseMatrix(const CudaSparseMatrix&) = delete; CudaSparseMatrix& operator=(const CudaSparseMatrix&) = delete; + // Allocate temporary buffer for left/right products, create cuSPARSE + // descriptors + void Initialize(); + // y = y + op(M)x. op must be either CUSPARSE_OPERATION_NON_TRANSPOSE or // CUSPARSE_OPERATION_TRANSPOSE. - void SpMv(cusparseOperation_t op, const CudaVector& x, CudaVector* y); + void SpMv(cusparseOperation_t op, + const cusparseDnVecDescr_t& x, + const cusparseDnVecDescr_t& y) const; int num_rows_ = 0; int num_cols_ = 0; @@ -123,7 +130,11 @@ // CuSparse object that describes this matrix. cusparseSpMatDescr_t descr_ = nullptr; - CudaBuffer<uint8_t> spmv_buffer_; + // Dense vector descriptors for pointer interface + cusparseDnVecDescr_t descr_vec_left_ = nullptr; + cusparseDnVecDescr_t descr_vec_right_ = nullptr; + + mutable CudaBuffer<uint8_t> spmv_buffer_; }; } // namespace ceres::internal
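The header makes the trade-off explicit: the products are const for callers, but they still scratch into spmv_buffer_, hence the mutable qualifier and the "not thread-safe" comment. If one matrix is shared across threads, the caller has to serialize the products; a hedged sketch follows (the mutex is caller-owned and assumed here, it is not part of the class):

    std::mutex spmv_mutex;  // caller-owned, not provided by CudaSparseMatrix
    {
      std::lock_guard<std::mutex> lock(spmv_mutex);
      matrix.RightMultiplyAndAccumulate(x, &y);  // const, but writes spmv_buffer_
    }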
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc index d434f36..b714aaf 100644 --- a/internal/ceres/cuda_vector.cc +++ b/internal/ceres/cuda_vector.cc
@@ -57,6 +57,15 @@ Resize(size); } +CudaVector::CudaVector(CudaVector&& other) + : num_rows_(other.num_rows_), + context_(other.context_), + data_(std::move(other.data_)), + descr_(other.descr_) { + other.num_rows_ = 0; + other.descr_ = nullptr; +} + CudaVector& CudaVector::operator=(const CudaVector& other) { if (this != &other) { Resize(other.num_rows()); @@ -88,7 +97,7 @@ num_rows_, data_.data(), 1, - x.data().data(), + x.data(), 1, &result), CUBLAS_STATUS_SUCCESS) @@ -105,13 +114,15 @@ return result; } +void CudaVector::CopyFromCpu(const double* x) { + data_.CopyFromCpu(x, num_rows_); +} + void CudaVector::CopyFromCpu(const Vector& x) { - data_.Reserve(x.rows()); - data_.CopyFromCpu(x.data(), x.rows()); - num_rows_ = x.rows(); - DestroyDescriptor(); - CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F), - CUSPARSE_STATUS_SUCCESS); + if (x.rows() != num_rows_) { + Resize(x.rows()); + } + CopyFromCpu(x.data()); } void CudaVector::CopyTo(Vector* x) const { @@ -126,6 +137,8 @@ } void CudaVector::SetZero() { + // Allow empty vector to be zeroed + if (num_rows_ == 0) return; CHECK(data_.data() != nullptr); CudaSetZeroFP64(data_.data(), num_rows_, context_->DefaultStream()); } @@ -147,7 +160,7 @@ CHECK_EQ(cublasDaxpy(context_->cublas_handle_, num_rows_, &a, - x.data().data(), + x.data(), 1, data_.data(), 1), @@ -156,11 +169,8 @@ } void CudaVector::DtDxpy(const CudaVector& D, const CudaVector& x) { - CudaDtDxpy(data_.data(), - D.data().data(), - x.data().data(), - num_rows_, - context_->DefaultStream()); + CudaDtDxpy( + data_.data(), D.data(), x.data(), num_rows_, context_->DefaultStream()); } void CudaVector::Scale(double s) {
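A short usage sketch of the reworked CopyFromCpu path (illustrative; the context and the Eigen-backed Vector are assumed to be set up as in the tests):

    CudaVector v(&context, 0);
    Vector x = Vector::Random(1000);
    v.CopyFromCpu(x);         // size differs -> Resize(1000), then device copy
    v.CopyFromCpu(x);         // same size -> copy in place, no reallocation and
                              // no re-creation of the cuSPARSE vector descriptor
    v.CopyFromCpu(x.data());  // new raw-pointer overload: the caller guarantees
                              // the array holds v.num_rows() entries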
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h index 46661cf..cbf3b27 100644 --- a/internal/ceres/cuda_vector.h +++ b/internal/ceres/cuda_vector.h
@@ -65,6 +65,8 @@ // context before calling this method. CudaVector(ContextImpl* context, int size); + CudaVector(CudaVector&& other); + ~CudaVector(); void Resize(int size); @@ -84,6 +86,9 @@ // Copy from Eigen vector. void CopyFromCpu(const Vector& x); + // Copy from CPU memory array. + void CopyFromCpu(const double* x); + // Copy to Eigen vector. void CopyTo(Vector* x) const; @@ -103,7 +108,8 @@ int num_rows() const { return num_rows_; } int num_cols() const { return 1; } - const CudaBuffer<double>& data() const { return data_; } + const double* data() const { return data_.data(); } + double* mutable_data() { return data_.data(); } const cusparseDnVecDescr_t& descr() const { return descr_; }
diff --git a/internal/ceres/cuda_vector_test.cc b/internal/ceres/cuda_vector_test.cc index db1fec5..fd7c166 100644 --- a/internal/ceres/cuda_vector_test.cc +++ b/internal/ceres/cuda_vector_test.cc
@@ -48,7 +48,7 @@ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message; CudaVector x(&context, 1000); EXPECT_EQ(x.num_rows(), 1000); - EXPECT_NE(x.data().data(), nullptr); + EXPECT_NE(x.data(), nullptr); } TEST(CudaVector, CopyVector) { @@ -67,6 +67,23 @@ EXPECT_EQ(x, z); } +TEST(CudaVector, Move) { + ContextImpl context; + std::string message; + CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message; + CudaVector y(&context, 10); + const auto y_data = y.data(); + const auto y_descr = y.descr(); + EXPECT_EQ(y.num_rows(), 10); + CudaVector z(std::move(y)); + EXPECT_EQ(y.data(), nullptr); + EXPECT_EQ(y.descr(), nullptr); + EXPECT_EQ(y.num_rows(), 0); + + EXPECT_EQ(z.data(), y_data); + EXPECT_EQ(z.descr(), y_descr); +} + TEST(CudaVector, DeepCopy) { Vector x(3); x << 1, 2, 3;
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc index 28a8aff..46ebe13 100644 --- a/internal/ceres/evaluation_benchmark.cc +++ b/internal/ceres/evaluation_benchmark.cc
@@ -496,7 +496,7 @@ cuda_x.CopyFromCpu(x); cuda_y.SetZero(); - auto matrix = view.mutable_matrix_f(); + auto matrix = view.matrix_f(); for (auto _ : state) { matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y); } @@ -522,7 +522,7 @@ cuda_x.CopyFromCpu(x); cuda_y.SetZero(); - auto matrix = view.mutable_matrix_f(); + auto matrix = view.matrix_f(); for (auto _ : state) { matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y); } @@ -548,7 +548,7 @@ cuda_x.CopyFromCpu(x); cuda_y.SetZero(); - auto matrix = view.mutable_matrix_e(); + auto matrix = view.matrix_e(); for (auto _ : state) { matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y); } @@ -574,7 +574,7 @@ cuda_x.CopyFromCpu(x); cuda_y.SetZero(); - auto matrix = view.mutable_matrix_e(); + auto matrix = view.matrix_e(); for (auto _ : state) { matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y); }