| // Ceres Solver - A fast non-linear least squares minimizer |
| // Copyright 2023 Google Inc. All rights reserved. |
| // http://ceres-solver.org/ |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are met: |
| // |
| // * Redistributions of source code must retain the above copyright notice, |
| // this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above copyright notice, |
| // this list of conditions and the following disclaimer in the documentation |
| // and/or other materials provided with the distribution. |
| // * Neither the name of Google Inc. nor the names of its contributors may be |
| // used to endorse or promote products derived from this software without |
| // specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| // POSSIBILITY OF SUCH DAMAGE. |
| // |
| // Author: joydeepb@cs.utexas.edu (Joydeep Biswas) |
| // |
| // A CUDA sparse matrix linear operator. |
| |
| // This include must come before any #ifndef check on Ceres compile options. |
| // clang-format off |
| #include "ceres/internal/config.h" |
| // clang-format on |
| |
| #include "ceres/cuda_sparse_matrix.h" |
| |
| #include <math.h> |
| |
| #include <memory> |
| |
| #include "absl/log/check.h" |
| #include "ceres/block_sparse_matrix.h" |
| #include "ceres/compressed_row_sparse_matrix.h" |
| #include "ceres/context_impl.h" |
| #include "ceres/crs_matrix.h" |
| #include "ceres/internal/export.h" |
| #include "ceres/types.h" |
| |
| #ifndef CERES_NO_CUDA |
| |
| #include "ceres/cuda_buffer.h" |
| #include "ceres/cuda_kernels_vector_ops.h" |
| #include "ceres/cuda_vector.h" |
| #include "cuda_runtime_api.h" |
| #include "cusparse.h" |
| |
| namespace ceres::internal { |
| namespace { |
| // Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of |
| // CUSPARSE_SPMV_ALG_DEFAULT. |
| #if CUDART_VERSION >= 11021 |
| const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT; |
| #else // CUDART_VERSION >= 11021 |
| const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT; |
| #endif // CUDART_VERSION >= 11021 |
| size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle, |
| const cusparseOperation_t op, |
| const cusparseDnVecDescr_t& x, |
| const cusparseDnVecDescr_t& y, |
| const cusparseSpMatDescr_t& A) { |
| size_t buffer_size; |
| const double alpha = 1.0; |
| const double beta = 1.0; |
| CHECK_NE(A, nullptr); |
| CHECK_EQ(cusparseSpMV_bufferSize(handle, |
| op, |
| &alpha, |
| A, |
| x, |
| &beta, |
| y, |
| CUDA_R_64F, |
| kSpMVAlgorithm, |
| &buffer_size), |
| CUSPARSE_STATUS_SUCCESS); |
| return buffer_size; |
| } |
| |
| size_t GetTempBufferSize(const cusparseHandle_t& handle, |
| const cusparseDnVecDescr_t& left, |
| const cusparseDnVecDescr_t& right, |
| const cusparseSpMatDescr_t& A) { |
| CHECK_NE(A, nullptr); |
| return std::max(GetTempBufferSizeForOp( |
| handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A), |
| GetTempBufferSizeForOp( |
| handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A)); |
| } |
| } // namespace |
| |
| CudaSparseMatrix::CudaSparseMatrix(int num_cols, |
| CudaBuffer<int32_t>&& rows, |
| CudaBuffer<int32_t>&& cols, |
| ContextImpl* context) |
| : num_rows_(rows.size() - 1), |
| num_cols_(num_cols), |
| num_nonzeros_(cols.size()), |
| context_(context), |
| rows_(std::move(rows)), |
| cols_(std::move(cols)), |
| values_(context, num_nonzeros_), |
| spmv_buffer_(context) { |
| Initialize(); |
| } |
| |
| CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context, |
| const CompressedRowSparseMatrix& crs_matrix) |
| : num_rows_(crs_matrix.num_rows()), |
| num_cols_(crs_matrix.num_cols()), |
| num_nonzeros_(crs_matrix.num_nonzeros()), |
| context_(context), |
| rows_(context, num_rows_ + 1), |
| cols_(context, num_nonzeros_), |
| values_(context, num_nonzeros_), |
| spmv_buffer_(context) { |
| rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1); |
| cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_); |
| values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); |
| Initialize(); |
| } |
| |
| CudaSparseMatrix::~CudaSparseMatrix() { |
| CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS); |
| descr_ = nullptr; |
| CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_)); |
| CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_)); |
| } |
| |
| void CudaSparseMatrix::CopyValuesFromCpu( |
| const CompressedRowSparseMatrix& crs_matrix) { |
| // There is no quick and easy way to verify that the structure is unchanged, |
| // but at least we can check that the size of the matrix and the number of |
| // nonzeros is unchanged. |
| CHECK_EQ(num_rows_, crs_matrix.num_rows()); |
| CHECK_EQ(num_cols_, crs_matrix.num_cols()); |
| CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros()); |
| values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); |
| } |
| |
| void CudaSparseMatrix::Initialize() { |
| CHECK(context_->IsCudaInitialized()); |
| CHECK_EQ(CUSPARSE_STATUS_SUCCESS, |
| cusparseCreateCsr(&descr_, |
| num_rows_, |
| num_cols_, |
| num_nonzeros_, |
| rows_.data(), |
| cols_.data(), |
| values_.data(), |
| CUSPARSE_INDEX_32I, |
| CUSPARSE_INDEX_32I, |
| CUSPARSE_INDEX_BASE_ZERO, |
| CUDA_R_64F)); |
| |
| // Note: values_.data() is used as non-zero pointer to device memory |
| // When there is no non-zero values, data-pointer of values_ array will be a |
| // nullptr; but in this case left/right products are trivial and temporary |
| // buffer (and vector descriptors) is not required |
| if (!num_nonzeros_) return; |
| |
| CHECK_EQ(CUSPARSE_STATUS_SUCCESS, |
| cusparseCreateDnVec( |
| &descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F)); |
| CHECK_EQ(CUSPARSE_STATUS_SUCCESS, |
| cusparseCreateDnVec( |
| &descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F)); |
| size_t buffer_size = GetTempBufferSize( |
| context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_); |
| spmv_buffer_.Reserve(buffer_size); |
| } |
| |
| void CudaSparseMatrix::SpMv(cusparseOperation_t op, |
| const cusparseDnVecDescr_t& x, |
| const cusparseDnVecDescr_t& y) const { |
| const double alpha = 1.0; |
| const double beta = 1.0; |
| |
| CHECK_EQ(cusparseSpMV(context_->cusparse_handle_, |
| op, |
| &alpha, |
| descr_, |
| x, |
| &beta, |
| y, |
| CUDA_R_64F, |
| kSpMVAlgorithm, |
| spmv_buffer_.data()), |
| CUSPARSE_STATUS_SUCCESS); |
| } |
| |
| void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, |
| CudaVector* y) const { |
| DCHECK(GetTempBufferSize( |
| context_->cusparse_handle_, y->descr(), x.descr(), descr_) <= |
| spmv_buffer_.size()); |
| SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr()); |
| } |
| |
| void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, |
| CudaVector* y) const { |
| // TODO(Joydeep Biswas): We should consider storing a transposed copy of the |
| // matrix by converting CSR to CSC. From the cuSPARSE documentation: |
| // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA |
| // != CUSPARSE_OPERATION_NON_TRANSPOSE" |
| DCHECK(GetTempBufferSize( |
| context_->cusparse_handle_, x.descr(), y->descr(), descr_) <= |
| spmv_buffer_.size()); |
| SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr()); |
| } |
| |
| } // namespace ceres::internal |
| |
| #endif // CERES_NO_CUDA |