|  | // Ceres Solver - A fast non-linear least squares minimizer | 
|  | // Copyright 2023 Google Inc. All rights reserved. | 
|  | // http://ceres-solver.org/ | 
|  | // | 
|  | // Redistribution and use in source and binary forms, with or without | 
|  | // modification, are permitted provided that the following conditions are met: | 
|  | // | 
|  | // * Redistributions of source code must retain the above copyright notice, | 
|  | //   this list of conditions and the following disclaimer. | 
|  | // * Redistributions in binary form must reproduce the above copyright notice, | 
|  | //   this list of conditions and the following disclaimer in the documentation | 
|  | //   and/or other materials provided with the distribution. | 
|  | // * Neither the name of Google Inc. nor the names of its contributors may be | 
|  | //   used to endorse or promote products derived from this software without | 
|  | //   specific prior written permission. | 
|  | // | 
|  | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
|  | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|  | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
|  | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
|  | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
|  | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
|  | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
|  | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
|  | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
|  | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
|  | // POSSIBILITY OF SUCH DAMAGE. | 
|  | // | 
|  | // Author: joydeepb@cs.utexas.edu (Joydeep Biswas) | 
|  | // | 
|  | // A CUDA sparse matrix linear operator. | 
|  |  | 
|  | // This include must come before any #ifndef check on Ceres compile options. | 
|  | // clang-format off | 
|  | #include "ceres/internal/config.h" | 
|  | // clang-format on | 
|  |  | 
|  | #include "ceres/cuda_sparse_matrix.h" | 
|  |  | 
|  | #include <math.h> | 
|  |  | 
|  | #include <memory> | 
|  |  | 
|  | #include "ceres/block_sparse_matrix.h" | 
|  | #include "ceres/compressed_row_sparse_matrix.h" | 
|  | #include "ceres/context_impl.h" | 
|  | #include "ceres/crs_matrix.h" | 
|  | #include "ceres/internal/export.h" | 
|  | #include "ceres/types.h" | 
|  | #include "ceres/wall_time.h" | 
|  |  | 
|  | #ifndef CERES_NO_CUDA | 
|  |  | 
|  | #include "ceres/cuda_buffer.h" | 
|  | #include "ceres/cuda_kernels_vector_ops.h" | 
|  | #include "ceres/cuda_vector.h" | 
|  | #include "cuda_runtime_api.h" | 
|  | #include "cusparse.h" | 
|  |  | 
|  | namespace ceres::internal { | 
|  | namespace { | 
|  | // Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of | 
|  | // CUSPARSE_SPMV_ALG_DEFAULT. | 
|  | #if CUDART_VERSION >= 11021 | 
|  | const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT; | 
|  | #else   // CUDART_VERSION >= 11021 | 
|  | const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT; | 
|  | #endif  // CUDART_VERSION >= 11021 | 
|  | size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle, | 
|  | const cusparseOperation_t op, | 
|  | const cusparseDnVecDescr_t& x, | 
|  | const cusparseDnVecDescr_t& y, | 
|  | const cusparseSpMatDescr_t& A) { | 
|  | size_t buffer_size; | 
|  | const double alpha = 1.0; | 
|  | const double beta = 1.0; | 
|  | CHECK_NE(A, nullptr); | 
|  | CHECK_EQ(cusparseSpMV_bufferSize(handle, | 
|  | op, | 
|  | &alpha, | 
|  | A, | 
|  | x, | 
|  | &beta, | 
|  | y, | 
|  | CUDA_R_64F, | 
|  | kSpMVAlgorithm, | 
|  | &buffer_size), | 
|  | CUSPARSE_STATUS_SUCCESS); | 
|  | return buffer_size; | 
|  | } | 
|  |  | 
|  | size_t GetTempBufferSize(const cusparseHandle_t& handle, | 
|  | const cusparseDnVecDescr_t& left, | 
|  | const cusparseDnVecDescr_t& right, | 
|  | const cusparseSpMatDescr_t& A) { | 
|  | CHECK_NE(A, nullptr); | 
|  | return std::max(GetTempBufferSizeForOp( | 
|  | handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A), | 
|  | GetTempBufferSizeForOp( | 
|  | handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A)); | 
|  | } | 
|  | }  // namespace | 
|  |  | 
|  | CudaSparseMatrix::CudaSparseMatrix(int num_cols, | 
|  | CudaBuffer<int32_t>&& rows, | 
|  | CudaBuffer<int32_t>&& cols, | 
|  | ContextImpl* context) | 
|  | : num_rows_(rows.size() - 1), | 
|  | num_cols_(num_cols), | 
|  | num_nonzeros_(cols.size()), | 
|  | context_(context), | 
|  | rows_(std::move(rows)), | 
|  | cols_(std::move(cols)), | 
|  | values_(context, num_nonzeros_), | 
|  | spmv_buffer_(context) { | 
|  | Initialize(); | 
|  | } | 
|  |  | 
|  | CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context, | 
|  | const CompressedRowSparseMatrix& crs_matrix) | 
|  | : num_rows_(crs_matrix.num_rows()), | 
|  | num_cols_(crs_matrix.num_cols()), | 
|  | num_nonzeros_(crs_matrix.num_nonzeros()), | 
|  | context_(context), | 
|  | rows_(context, num_rows_ + 1), | 
|  | cols_(context, num_nonzeros_), | 
|  | values_(context, num_nonzeros_), | 
|  | spmv_buffer_(context) { | 
|  | rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1); | 
|  | cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_); | 
|  | values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); | 
|  | Initialize(); | 
|  | } | 
|  |  | 
|  | CudaSparseMatrix::~CudaSparseMatrix() { | 
|  | CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS); | 
|  | descr_ = nullptr; | 
|  | CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_)); | 
|  | CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_)); | 
|  | } | 
|  |  | 
|  | void CudaSparseMatrix::CopyValuesFromCpu( | 
|  | const CompressedRowSparseMatrix& crs_matrix) { | 
|  | // There is no quick and easy way to verify that the structure is unchanged, | 
|  | // but at least we can check that the size of the matrix and the number of | 
|  | // nonzeros is unchanged. | 
|  | CHECK_EQ(num_rows_, crs_matrix.num_rows()); | 
|  | CHECK_EQ(num_cols_, crs_matrix.num_cols()); | 
|  | CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros()); | 
|  | values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); | 
|  | } | 
|  |  | 
|  | void CudaSparseMatrix::Initialize() { | 
|  | CHECK(context_->IsCudaInitialized()); | 
|  | CHECK_EQ(CUSPARSE_STATUS_SUCCESS, | 
|  | cusparseCreateCsr(&descr_, | 
|  | num_rows_, | 
|  | num_cols_, | 
|  | num_nonzeros_, | 
|  | rows_.data(), | 
|  | cols_.data(), | 
|  | values_.data(), | 
|  | CUSPARSE_INDEX_32I, | 
|  | CUSPARSE_INDEX_32I, | 
|  | CUSPARSE_INDEX_BASE_ZERO, | 
|  | CUDA_R_64F)); | 
|  |  | 
|  | // Note: values_.data() is used as non-zero pointer to device memory | 
|  | // When there is no non-zero values, data-pointer of values_ array will be a | 
|  | // nullptr; but in this case left/right products are trivial and temporary | 
|  | // buffer (and vector descriptors) is not required | 
|  | if (!num_nonzeros_) return; | 
|  |  | 
|  | CHECK_EQ(CUSPARSE_STATUS_SUCCESS, | 
|  | cusparseCreateDnVec( | 
|  | &descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F)); | 
|  | CHECK_EQ(CUSPARSE_STATUS_SUCCESS, | 
|  | cusparseCreateDnVec( | 
|  | &descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F)); | 
|  | size_t buffer_size = GetTempBufferSize( | 
|  | context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_); | 
|  | spmv_buffer_.Reserve(buffer_size); | 
|  | } | 
|  |  | 
|  | void CudaSparseMatrix::SpMv(cusparseOperation_t op, | 
|  | const cusparseDnVecDescr_t& x, | 
|  | const cusparseDnVecDescr_t& y) const { | 
|  | const double alpha = 1.0; | 
|  | const double beta = 1.0; | 
|  |  | 
|  | CHECK_EQ(cusparseSpMV(context_->cusparse_handle_, | 
|  | op, | 
|  | &alpha, | 
|  | descr_, | 
|  | x, | 
|  | &beta, | 
|  | y, | 
|  | CUDA_R_64F, | 
|  | kSpMVAlgorithm, | 
|  | spmv_buffer_.data()), | 
|  | CUSPARSE_STATUS_SUCCESS); | 
|  | } | 
|  |  | 
|  | void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, | 
|  | CudaVector* y) const { | 
|  | DCHECK(GetTempBufferSize( | 
|  | context_->cusparse_handle_, y->descr(), x.descr(), descr_) <= | 
|  | spmv_buffer_.size()); | 
|  | SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr()); | 
|  | } | 
|  |  | 
|  | void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, | 
|  | CudaVector* y) const { | 
|  | // TODO(Joydeep Biswas): We should consider storing a transposed copy of the | 
|  | // matrix by converting CSR to CSC. From the cuSPARSE documentation: | 
|  | // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA | 
|  | // != CUSPARSE_OPERATION_NON_TRANSPOSE" | 
|  | DCHECK(GetTempBufferSize( | 
|  | context_->cusparse_handle_, x.descr(), y->descr(), descr_) <= | 
|  | spmv_buffer_.size()); | 
|  | SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr()); | 
|  | } | 
|  |  | 
|  | }  // namespace ceres::internal | 
|  |  | 
|  | #endif  // CERES_NO_CUDA |