Update CudaSparseMatrix class

- Perform temporary buffer size estimation for SpMV only once, at matrix
  construction time
- Allow construction from existing device buffers with the row/column
  structure
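
Illustrative sketch of the new construction path (hypothetical sizes;
FillStructureOnGpu is a stand-in for kernels such as FillCRSStructure):

  // Build the CRS structure directly in device buffers.
  CudaBuffer<int32_t> rows(context, num_rows + 1);
  CudaBuffer<int32_t> cols(context, num_nonzeros);
  FillStructureOnGpu(rows.data(), cols.data());
  // The constructor takes ownership, deduces num_rows and num_nonzeros from
  // the buffer sizes, and estimates the SpMV temporary buffer size once.
  CudaSparseMatrix matrix(num_cols, std::move(rows), std::move(cols), context);
  // Values are uninitialized until set, e.g. via CopyValuesFromCpu().
  // Products are const and reuse the pre-sized temporary buffer:
  matrix.RightMultiplyAndAccumulate(x, &y);   // y = y + A x
  matrix.LeftMultiplyAndAccumulate(x2, &y2);  // y2 = y2 + A' x2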
Change-Id: I73c291328f1e8ed9184aba5d7058df71cbc6a15d
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc
index 6d4c6b0..f0da35f 100644
--- a/internal/ceres/cuda_block_sparse_crs_view.cc
+++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -41,16 +41,16 @@
: context_(context) {
block_structure_ = std::make_unique<CudaBlockSparseStructure>(
*bsm.block_structure(), context);
- crs_matrix_ = std::make_unique<CudaSparseMatrix>(
- bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context);
+ CudaBuffer<int32_t> rows(context, bsm.num_rows() + 1);
+ CudaBuffer<int32_t> cols(context, bsm.num_nonzeros());
FillCRSStructure(block_structure_->num_row_blocks(),
bsm.num_rows(),
block_structure_->first_cell_in_row_block(),
block_structure_->cells(),
block_structure_->row_blocks(),
block_structure_->col_blocks(),
- crs_matrix_->mutable_rows(),
- crs_matrix_->mutable_cols(),
+ rows.data(),
+ cols.data(),
context->DefaultStream());
is_crs_compatible_ = block_structure_->IsCrsCompatible();
// if matrix is crs-compatible - we can drop block-structure and don't need
@@ -63,6 +63,8 @@
streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>(
context_, kMaxTemporaryArraySize);
}
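+  // Move ownership of the filled structure into the CRS matrix; its
+  // constructor performs the one-time SpMV buffer size estimation.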
+ crs_matrix_ = std::make_unique<CudaSparseMatrix>(
+ bsm.num_cols(), std::move(rows), std::move(cols), context);
UpdateValues(bsm);
}
diff --git a/internal/ceres/cuda_block_sparse_crs_view.h b/internal/ceres/cuda_block_sparse_crs_view.h
index 2ae8721..58ef618 100644
--- a/internal/ceres/cuda_block_sparse_crs_view.h
+++ b/internal/ceres/cuda_block_sparse_crs_view.h
@@ -80,6 +80,14 @@
// Returns true if block-sparse matrix had CRS-compatible value layout
bool IsCrsCompatible() const { return is_crs_compatible_; }
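+
+  // y = y + A'x;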
+ void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const {
+ crs_matrix()->LeftMultiplyAndAccumulate(x, y);
+ }
+
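+  // y = y + Ax;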
+ void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const {
+ crs_matrix()->RightMultiplyAndAccumulate(x, y);
+ }
+
private:
// Value permutation kernel performs a single element-wise operation per
// thread, thus performing permutation in blocks of 8 megabytes of
diff --git a/internal/ceres/cuda_block_sparse_crs_view_test.cc b/internal/ceres/cuda_block_sparse_crs_view_test.cc
index dd49ee8..07e8513 100644
--- a/internal/ceres/cuda_block_sparse_crs_view_test.cc
+++ b/internal/ceres/cuda_block_sparse_crs_view_test.cc
@@ -99,7 +99,7 @@
1);
}
- void Compare(const BlockSparseMatrix& bsm, CudaSparseMatrix& csm) {
+ void Compare(const BlockSparseMatrix& bsm, const CudaSparseMatrix& csm) {
ASSERT_EQ(csm.num_cols(), bsm.num_cols());
ASSERT_EQ(csm.num_rows(), bsm.num_rows());
ASSERT_EQ(csm.num_nonzeros(), bsm.num_nonzeros());
@@ -138,7 +138,7 @@
CudaBlockSparseCRSView(*block_sparse_non_crs_compatible_, &context_);
ASSERT_EQ(view.IsCrsCompatible(), false);
- auto matrix = view.mutable_crs_matrix();
+ auto matrix = view.crs_matrix();
Compare(*block_sparse_non_crs_compatible_, *matrix);
}
@@ -147,7 +147,7 @@
CudaBlockSparseCRSView(*block_sparse_crs_compatible_rows_, &context_);
ASSERT_EQ(view.IsCrsCompatible(), true);
- auto matrix = view.mutable_crs_matrix();
+ auto matrix = view.crs_matrix();
Compare(*block_sparse_crs_compatible_rows_, *matrix);
}
@@ -156,7 +156,7 @@
&context_);
ASSERT_EQ(view.IsCrsCompatible(), true);
- auto matrix = view.mutable_crs_matrix();
+ auto matrix = view.crs_matrix();
Compare(*block_sparse_crs_compatible_single_cell_, *matrix);
}
} // namespace ceres::internal
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index d6abb15..98cab5d 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -55,6 +55,13 @@
CudaBuffer(ContextImpl* context, int size) : context_(context) {
Reserve(size);
}
+
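+  // Move construction transfers ownership of the device memory and leaves
+  // the moved-from buffer empty.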
+ CudaBuffer(CudaBuffer&& other)
+ : data_(other.data_), size_(other.size_), context_(other.context_) {
+ other.data_ = nullptr;
+ other.size_ = 0;
+ }
+
CudaBuffer(const CudaBuffer&) = delete;
CudaBuffer& operator=(const CudaBuffer&) = delete;
@@ -162,4 +169,4 @@
#endif // CERES_NO_CUDA
-#endif // CERES_INTERNAL_CUDA_BUFFER_H_
\ No newline at end of file
+#endif // CERES_INTERNAL_CUDA_BUFFER_H_
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
index 703be84..9153544 100644
--- a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
+++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
@@ -59,10 +59,11 @@
? bs.cols[num_col_blocks_e].position
: bsm.num_cols();
const int num_cols_f = bsm.num_cols() - num_cols_e;
- matrix_e_ = std::make_unique<CudaSparseMatrix>(
- num_rows, num_cols_e, num_nonzeros_e, context);
- matrix_f_ = std::make_unique<CudaSparseMatrix>(
- num_rows, num_cols_f, num_nonzeros_f, context);
+
+ CudaBuffer<int32_t> rows_e(context, num_rows + 1);
+ CudaBuffer<int32_t> cols_e(context, num_nonzeros_e);
+ CudaBuffer<int32_t> rows_f(context, num_rows + 1);
+ CudaBuffer<int32_t> cols_f(context, num_nonzeros_f);
num_row_blocks_e_ = block_structure_->num_row_blocks_e();
FillCRSStructurePartitioned(block_structure_->num_row_blocks(),
@@ -74,10 +75,10 @@
block_structure_->cells(),
block_structure_->row_blocks(),
block_structure_->col_blocks(),
- matrix_e_->mutable_rows(),
- matrix_e_->mutable_cols(),
- matrix_f_->mutable_rows(),
- matrix_f_->mutable_cols(),
+ rows_e.data(),
+ cols_e.data(),
+ rows_f.data(),
+ cols_f.data(),
context->DefaultStream());
f_is_crs_compatible_ = block_structure_->IsCrsCompatible();
if (f_is_crs_compatible_) {
@@ -86,6 +87,11 @@
streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>(
context, kMaxTemporaryArraySize);
}
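+  // Move the filled structure into the two matrices; each performs its
+  // one-time SpMV buffer size estimation at construction.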
+ matrix_e_ = std::make_unique<CudaSparseMatrix>(
+ num_cols_e, std::move(rows_e), std::move(cols_e), context);
+ matrix_f_ = std::make_unique<CudaSparseMatrix>(
+ num_cols_f, std::move(rows_f), std::move(cols_f), context);
+
CHECK_EQ(bsm.num_nonzeros(),
matrix_e_->num_nonzeros() + matrix_f_->num_nonzeros());
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc
index 5090b6a..ddfdeef 100644
--- a/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc
+++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc
@@ -196,11 +196,8 @@
const int num_cols_e = bs.cols[num_col_blocks_e].position;
const int num_cols_f = num_cols - num_cols_e;
- // TODO: we definitely would like to use matrix() here, but
- // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because
- // it might allocate additional storage by request of cuSPARSE
- auto matrix_e = view.mutable_matrix_e();
- auto matrix_f = view.mutable_matrix_f();
+ auto matrix_e = view.matrix_e();
+ auto matrix_f = view.matrix_f();
ASSERT_EQ(matrix_e->num_cols(), num_cols_e);
ASSERT_EQ(matrix_e->num_rows(), num_rows);
ASSERT_EQ(matrix_f->num_cols(), num_cols_f);
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
index 905b4ab..2d6f2f8 100644
--- a/internal/ceres/cuda_sparse_matrix.cc
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -58,63 +58,85 @@
#include "cusparse.h"
namespace ceres::internal {
+namespace {
+// Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of
+// CUSPARSE_SPMV_ALG_DEFAULT.
+#if CUDART_VERSION >= 11021
+const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT;
+#else // CUDART_VERSION >= 11021
+const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT;
+#endif // CUDART_VERSION >= 11021
+size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle,
+ const cusparseOperation_t op,
+ const cusparseDnVecDescr_t& x,
+ const cusparseDnVecDescr_t& y,
+ const cusparseSpMatDescr_t& A) {
+ size_t buffer_size;
+ const double alpha = 1.0;
+ const double beta = 1.0;
+ CHECK_NE(A, nullptr);
+ CHECK_EQ(cusparseSpMV_bufferSize(handle,
+ op,
+ &alpha,
+ A,
+ x,
+ &beta,
+ y,
+ CUDA_R_64F,
+ kSpMVAlgorithm,
+ &buffer_size),
+ CUSPARSE_STATUS_SUCCESS);
+ return buffer_size;
+}
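+
+// Returns a temporary buffer size sufficient for both transposed and
+// non-transposed products, so that a single buffer can serve both.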
-CudaSparseMatrix::CudaSparseMatrix(int num_rows,
- int num_cols,
- int num_nonzeros,
+size_t GetTempBufferSize(const cusparseHandle_t& handle,
+ const cusparseDnVecDescr_t& left,
+ const cusparseDnVecDescr_t& right,
+ const cusparseSpMatDescr_t& A) {
+ CHECK_NE(A, nullptr);
+ return std::max(GetTempBufferSizeForOp(
+ handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A),
+ GetTempBufferSizeForOp(
+ handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A));
+}
+} // namespace
+
+CudaSparseMatrix::CudaSparseMatrix(int num_cols,
+ CudaBuffer<int32_t>&& rows,
+ CudaBuffer<int32_t>&& cols,
ContextImpl* context)
- : num_rows_(num_rows),
+ : num_rows_(rows.size() - 1),
num_cols_(num_cols),
- num_nonzeros_(num_nonzeros),
+ num_nonzeros_(cols.size()),
context_(context),
- rows_(context, num_rows + 1),
- cols_(context, num_nonzeros),
- values_(context, num_nonzeros),
+ rows_(std::move(rows)),
+ cols_(std::move(cols)),
+ values_(context, num_nonzeros_),
spmv_buffer_(context) {
- cusparseCreateCsr(&descr_,
- num_rows_,
- num_cols_,
- num_nonzeros_,
- rows_.data(),
- cols_.data(),
- values_.data(),
- CUSPARSE_INDEX_32I,
- CUSPARSE_INDEX_32I,
- CUSPARSE_INDEX_BASE_ZERO,
- CUDA_R_64F);
+ Initialize();
}
CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
const CompressedRowSparseMatrix& crs_matrix)
- : context_(context),
- rows_{context},
- cols_{context},
- values_{context},
- spmv_buffer_{context} {
- DCHECK_NE(context, nullptr);
- CHECK(context->IsCudaInitialized());
- num_rows_ = crs_matrix.num_rows();
- num_cols_ = crs_matrix.num_cols();
- num_nonzeros_ = crs_matrix.num_nonzeros();
+ : num_rows_(crs_matrix.num_rows()),
+ num_cols_(crs_matrix.num_cols()),
+ num_nonzeros_(crs_matrix.num_nonzeros()),
+ context_(context),
+ rows_(context, num_rows_ + 1),
+ cols_(context, num_nonzeros_),
+ values_(context, num_nonzeros_),
+ spmv_buffer_(context) {
rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1);
cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_);
values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
- cusparseCreateCsr(&descr_,
- num_rows_,
- num_cols_,
- num_nonzeros_,
- rows_.data(),
- cols_.data(),
- values_.data(),
- CUSPARSE_INDEX_32I,
- CUSPARSE_INDEX_32I,
- CUSPARSE_INDEX_BASE_ZERO,
- CUDA_R_64F);
+ Initialize();
}
CudaSparseMatrix::~CudaSparseMatrix() {
CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS);
descr_ = nullptr;
+ CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_));
+ CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_));
}
void CudaSparseMatrix::CopyValuesFromCpu(
@@ -128,58 +150,76 @@
values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
}
+void CudaSparseMatrix::Initialize() {
+ CHECK(context_->IsCudaInitialized());
+ CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
+ cusparseCreateCsr(&descr_,
+ num_rows_,
+ num_cols_,
+ num_nonzeros_,
+ rows_.data(),
+ cols_.data(),
+ values_.data(),
+ CUSPARSE_INDEX_32I,
+ CUSPARSE_INDEX_32I,
+ CUSPARSE_INDEX_BASE_ZERO,
+ CUDA_R_64F));
+
+  // Note: values_.data() is used as a non-null pointer to device memory when
+  // creating the dense vector descriptors. When there are no non-zero
+  // values, the data pointer of the values_ array will be nullptr; in that
+  // case the left/right products are trivial and the temporary buffer (and
+  // the vector descriptors) are not required.
+ if (!num_nonzeros_) return;
+
+ CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
+ cusparseCreateDnVec(
+ &descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F));
+ CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
+ cusparseCreateDnVec(
+ &descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F));
+ size_t buffer_size = GetTempBufferSize(
+ context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_);
+ spmv_buffer_.Reserve(buffer_size);
+}
+
void CudaSparseMatrix::SpMv(cusparseOperation_t op,
- const CudaVector& x,
- CudaVector* y) {
+ const cusparseDnVecDescr_t& x,
+ const cusparseDnVecDescr_t& y) const {
-  size_t buffer_size = 0;
const double alpha = 1.0;
const double beta = 1.0;
- // Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of
- // CUSPARSE_SPMV_ALG_DEFAULT.
-#if CUDART_VERSION >= 11021
- const auto algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
-#else // CUDART_VERSION >= 11021
- const auto algorithm = CUSPARSE_MV_ALG_DEFAULT;
-#endif // CUDART_VERSION >= 11021
-
- CHECK_EQ(cusparseSpMV_bufferSize(context_->cusparse_handle_,
- op,
- &alpha,
- descr_,
- x.descr(),
- &beta,
- y->descr(),
- CUDA_R_64F,
- algorithm,
- &buffer_size),
- CUSPARSE_STATUS_SUCCESS);
- spmv_buffer_.Reserve(buffer_size);
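+  // The temporary buffer was pre-sized in Initialize() to fit both
+  // operations.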
CHECK_EQ(cusparseSpMV(context_->cusparse_handle_,
op,
&alpha,
descr_,
- x.descr(),
+ x,
&beta,
- y->descr(),
+ y,
CUDA_R_64F,
- algorithm,
+ kSpMVAlgorithm,
spmv_buffer_.data()),
CUSPARSE_STATUS_SUCCESS);
}
void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x,
- CudaVector* y) {
- SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x, y);
+ CudaVector* y) const {
+ DCHECK(GetTempBufferSize(
+ context_->cusparse_handle_, y->descr(), x.descr(), descr_) <=
+ spmv_buffer_.size());
+ SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr());
}
void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x,
- CudaVector* y) {
+ CudaVector* y) const {
// TODO(Joydeep Biswas): We should consider storing a transposed copy of the
// matrix by converting CSR to CSC. From the cuSPARSE documentation:
// "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
// != CUSPARSE_OPERATION_NON_TRANSPOSE"
- SpMv(CUSPARSE_OPERATION_TRANSPOSE, x, y);
+ DCHECK(GetTempBufferSize(
+ context_->cusparse_handle_, x.descr(), y->descr(), descr_) <=
+ spmv_buffer_.size());
+ SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr());
}
} // namespace ceres::internal
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h
index f5fcb91..d53afd1 100644
--- a/internal/ceres/cuda_sparse_matrix.h
+++ b/internal/ceres/cuda_sparse_matrix.h
@@ -56,27 +56,28 @@
// A sparse matrix hosted on the GPU in compressed row sparse format, with
// CUDA-accelerated operations.
+// The user of the class must ensure that ContextImpl::InitCuda() has already
+// been successfully called before using this class.
class CERES_NO_EXPORT CudaSparseMatrix {
public:
- // Create a GPU copy of the matrix provided. The caller must ensure that
- // InitCuda() has already been successfully called on context before calling
- // this constructor.
+ // Create a GPU copy of the matrix provided.
CudaSparseMatrix(ContextImpl* context,
const CompressedRowSparseMatrix& crs_matrix);
- // Creates a "blank" matrix with an appropriate amount of memory allocated.
- // The object itself is left in an inconsistent state.
- CudaSparseMatrix(int num_rows,
- int num_cols,
- int num_nonzeros,
+  // Create a matrix from existing device buffers holding the row and column
+  // index structure, taking ownership of them; num_rows and num_nonzeros are
+  // deduced from the buffer sizes. Values are left uninitialized.
+ CudaSparseMatrix(int num_cols,
+ CudaBuffer<int32_t>&& rows,
+ CudaBuffer<int32_t>&& cols,
ContextImpl* context);
~CudaSparseMatrix();
+  // Left/right products use an internal temporary buffer and are not
+  // thread-safe
// y = y + Ax;
- void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y);
+ void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const;
// y = y + A'x;
- void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y);
+ void LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const;
int num_rows() const { return num_rows_; }
int num_cols() const { return num_cols_; }
@@ -104,9 +105,15 @@
CudaSparseMatrix(const CudaSparseMatrix&) = delete;
CudaSparseMatrix& operator=(const CudaSparseMatrix&) = delete;
+  // Allocates the temporary buffer for left/right products and creates the
+  // cuSPARSE descriptors
+ void Initialize();
+
// y = y + op(M)x. op must be either CUSPARSE_OPERATION_NON_TRANSPOSE or
// CUSPARSE_OPERATION_TRANSPOSE.
- void SpMv(cusparseOperation_t op, const CudaVector& x, CudaVector* y);
+ void SpMv(cusparseOperation_t op,
+ const cusparseDnVecDescr_t& x,
+ const cusparseDnVecDescr_t& y) const;
int num_rows_ = 0;
int num_cols_ = 0;
@@ -123,7 +130,11 @@
// CuSparse object that describes this matrix.
cusparseSpMatDescr_t descr_ = nullptr;
- CudaBuffer<uint8_t> spmv_buffer_;
+  // Dense vector descriptors (backed by values_ memory) used only for
+  // temporary buffer size estimation via the pointer interface
+ cusparseDnVecDescr_t descr_vec_left_ = nullptr;
+ cusparseDnVecDescr_t descr_vec_right_ = nullptr;
+
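+  // Temporary buffer used by cusparseSpMV; sized once at construction,
+  // mutable so that the products can remain const.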
+ mutable CudaBuffer<uint8_t> spmv_buffer_;
};
} // namespace ceres::internal
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
index d434f36..b714aaf 100644
--- a/internal/ceres/cuda_vector.cc
+++ b/internal/ceres/cuda_vector.cc
@@ -57,6 +57,15 @@
Resize(size);
}
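+
+// Move construction takes over the device buffer and the cuSPARSE
+// descriptor, leaving the source vector empty.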
+CudaVector::CudaVector(CudaVector&& other)
+ : num_rows_(other.num_rows_),
+ context_(other.context_),
+ data_(std::move(other.data_)),
+ descr_(other.descr_) {
+ other.num_rows_ = 0;
+ other.descr_ = nullptr;
+}
+
CudaVector& CudaVector::operator=(const CudaVector& other) {
if (this != &other) {
Resize(other.num_rows());
@@ -88,7 +97,7 @@
num_rows_,
data_.data(),
1,
- x.data().data(),
+ x.data(),
1,
&result),
CUBLAS_STATUS_SUCCESS)
@@ -105,13 +114,15 @@
return result;
}
+void CudaVector::CopyFromCpu(const double* x) {
+ data_.CopyFromCpu(x, num_rows_);
+}
+
void CudaVector::CopyFromCpu(const Vector& x) {
- data_.Reserve(x.rows());
- data_.CopyFromCpu(x.data(), x.rows());
- num_rows_ = x.rows();
- DestroyDescriptor();
- CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F),
- CUSPARSE_STATUS_SUCCESS);
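+  // Reallocate (and re-create the cuSPARSE descriptor) only when the size
+  // changes.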
+ if (x.rows() != num_rows_) {
+ Resize(x.rows());
+ }
+ CopyFromCpu(x.data());
}
void CudaVector::CopyTo(Vector* x) const {
@@ -126,6 +137,8 @@
}
void CudaVector::SetZero() {
+ // Allow empty vector to be zeroed
+ if (num_rows_ == 0) return;
CHECK(data_.data() != nullptr);
CudaSetZeroFP64(data_.data(), num_rows_, context_->DefaultStream());
}
@@ -147,7 +160,7 @@
CHECK_EQ(cublasDaxpy(context_->cublas_handle_,
num_rows_,
&a,
- x.data().data(),
+ x.data(),
1,
data_.data(),
1),
@@ -156,11 +169,8 @@
}
void CudaVector::DtDxpy(const CudaVector& D, const CudaVector& x) {
- CudaDtDxpy(data_.data(),
- D.data().data(),
- x.data().data(),
- num_rows_,
- context_->DefaultStream());
+ CudaDtDxpy(
+ data_.data(), D.data(), x.data(), num_rows_, context_->DefaultStream());
}
void CudaVector::Scale(double s) {
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h
index 46661cf..cbf3b27 100644
--- a/internal/ceres/cuda_vector.h
+++ b/internal/ceres/cuda_vector.h
@@ -65,6 +65,8 @@
// context before calling this method.
CudaVector(ContextImpl* context, int size);
+ CudaVector(CudaVector&& other);
+
~CudaVector();
void Resize(int size);
@@ -84,6 +86,9 @@
// Copy from Eigen vector.
void CopyFromCpu(const Vector& x);
+  // Copy num_rows() values from a CPU memory array.
+ void CopyFromCpu(const double* x);
+
// Copy to Eigen vector.
void CopyTo(Vector* x) const;
@@ -103,7 +108,8 @@
int num_rows() const { return num_rows_; }
int num_cols() const { return 1; }
- const CudaBuffer<double>& data() const { return data_; }
+ const double* data() const { return data_.data(); }
+ double* mutable_data() { return data_.data(); }
const cusparseDnVecDescr_t& descr() const { return descr_; }
diff --git a/internal/ceres/cuda_vector_test.cc b/internal/ceres/cuda_vector_test.cc
index db1fec5..fd7c166 100644
--- a/internal/ceres/cuda_vector_test.cc
+++ b/internal/ceres/cuda_vector_test.cc
@@ -48,7 +48,7 @@
CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x(&context, 1000);
EXPECT_EQ(x.num_rows(), 1000);
- EXPECT_NE(x.data().data(), nullptr);
+ EXPECT_NE(x.data(), nullptr);
}
TEST(CudaVector, CopyVector) {
@@ -67,6 +67,23 @@
EXPECT_EQ(x, z);
}
+TEST(CudaVector, Move) {
+ ContextImpl context;
+ std::string message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
+ CudaVector y(&context, 10);
+ const auto y_data = y.data();
+ const auto y_descr = y.descr();
+ EXPECT_EQ(y.num_rows(), 10);
+ CudaVector z(std::move(y));
+ EXPECT_EQ(y.data(), nullptr);
+ EXPECT_EQ(y.descr(), nullptr);
+ EXPECT_EQ(y.num_rows(), 0);
+
+ EXPECT_EQ(z.data(), y_data);
+ EXPECT_EQ(z.descr(), y_descr);
+}
+
TEST(CudaVector, DeepCopy) {
Vector x(3);
x << 1, 2, 3;
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 28a8aff..46ebe13 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -496,7 +496,7 @@
cuda_x.CopyFromCpu(x);
cuda_y.SetZero();
- auto matrix = view.mutable_matrix_f();
+ auto matrix = view.matrix_f();
for (auto _ : state) {
matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
}
@@ -522,7 +522,7 @@
cuda_x.CopyFromCpu(x);
cuda_y.SetZero();
- auto matrix = view.mutable_matrix_f();
+ auto matrix = view.matrix_f();
for (auto _ : state) {
matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
}
@@ -548,7 +548,7 @@
cuda_x.CopyFromCpu(x);
cuda_y.SetZero();
- auto matrix = view.mutable_matrix_e();
+ auto matrix = view.matrix_e();
for (auto _ : state) {
matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
}
@@ -574,7 +574,7 @@
cuda_x.CopyFromCpu(x);
cuda_y.SetZero();
- auto matrix = view.mutable_matrix_e();
+ auto matrix = view.matrix_e();
for (auto _ : state) {
matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
}