| // Ceres Solver - A fast non-linear least squares minimizer | 
 | // Copyright 2022 Google Inc. All rights reserved. | 
 | // http://ceres-solver.org/ | 
 | // | 
 | // Redistribution and use in source and binary forms, with or without | 
 | // modification, are permitted provided that the following conditions are met: | 
 | // | 
 | // * Redistributions of source code must retain the above copyright notice, | 
 | //   this list of conditions and the following disclaimer. | 
 | // * Redistributions in binary form must reproduce the above copyright notice, | 
 | //   this list of conditions and the following disclaimer in the documentation | 
 | //   and/or other materials provided with the distribution. | 
 | // * Neither the name of Google Inc. nor the names of its contributors may be | 
 | //   used to endorse or promote products derived from this software without | 
 | //   specific prior written permission. | 
 | // | 
 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
 | // POSSIBILITY OF SUCH DAMAGE. | 
 | // | 
 | // Author: joydeepb@cs.utexas.edu (Joydeep Biswas) | 
 | // | 
 | // A CUDA sparse matrix linear operator. | 
 |  | 
 | // This include must come before any #ifndef check on Ceres compile options. | 
 | // clang-format off | 
 | #include "ceres/internal/config.h" | 
 | // clang-format on | 
 |  | 
 | #include "ceres/cuda_sparse_matrix.h" | 
 |  | 
 | #include <math.h> | 
 |  | 
 | #include <memory> | 
 |  | 
 | #include "ceres/block_sparse_matrix.h" | 
 | #include "ceres/compressed_row_sparse_matrix.h" | 
 | #include "ceres/context_impl.h" | 
 | #include "ceres/crs_matrix.h" | 
 | #include "ceres/internal/export.h" | 
 | #include "ceres/types.h" | 
 | #include "ceres/wall_time.h" | 
 |  | 
 | #ifndef CERES_NO_CUDA | 
 |  | 
 | #include "ceres/cuda_buffer.h" | 
 | #include "ceres/cuda_kernels.h" | 
 | #include "ceres/cuda_vector.h" | 
 | #include "cuda_runtime_api.h" | 
 | #include "cusparse.h" | 
 |  | 
 | namespace ceres::internal { | 
 |  | 
 | CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context, | 
 |                                    const CompressedRowSparseMatrix& crs_matrix) | 
 |     : context_(context), | 
 |       rows_{context}, | 
 |       cols_{context}, | 
 |       values_{context}, | 
 |       spmv_buffer_{context} { | 
 |   DCHECK_NE(context, nullptr); | 
 |   CHECK(context->IsCudaInitialized()); | 
 |   num_rows_ = crs_matrix.num_rows(); | 
 |   num_cols_ = crs_matrix.num_cols(); | 
 |   num_nonzeros_ = crs_matrix.num_nonzeros(); | 
 |   rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1); | 
 |   cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_); | 
 |   values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); | 
 |   cusparseCreateCsr(&descr_, | 
 |                     num_rows_, | 
 |                     num_cols_, | 
 |                     num_nonzeros_, | 
 |                     rows_.data(), | 
 |                     cols_.data(), | 
 |                     values_.data(), | 
 |                     CUSPARSE_INDEX_32I, | 
 |                     CUSPARSE_INDEX_32I, | 
 |                     CUSPARSE_INDEX_BASE_ZERO, | 
 |                     CUDA_R_64F); | 
 | } | 
 |  | 
 | CudaSparseMatrix::~CudaSparseMatrix() { | 
 |   CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS); | 
 |   descr_ = nullptr; | 
 | } | 
 |  | 
 | void CudaSparseMatrix::CopyValuesFromCpu( | 
 |     const CompressedRowSparseMatrix& crs_matrix) { | 
 |   // There is no quick and easy way to verify that the structure is unchanged, | 
 |   // but at least we can check that the size of the matrix and the number of | 
 |   // nonzeros is unchanged. | 
 |   CHECK_EQ(num_rows_, crs_matrix.num_rows()); | 
 |   CHECK_EQ(num_cols_, crs_matrix.num_cols()); | 
 |   CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros()); | 
 |   values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); | 
 | } | 
 |  | 
 | void CudaSparseMatrix::SpMv(cusparseOperation_t op, | 
 |                             const CudaVector& x, | 
 |                             CudaVector* y) { | 
 |   size_t buffer_size = 0; | 
 |   const double alpha = 1.0; | 
 |   const double beta = 1.0; | 
 |  | 
 |   // Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of | 
 |   // CUSPARSE_SPMV_ALG_DEFAULT. | 
 | #if CUDART_VERSION >= 11021 | 
 |   const auto algorithm = CUSPARSE_SPMV_ALG_DEFAULT; | 
 | #else   // CUDART_VERSION >= 11021 | 
 |   const auto algorithm = CUSPARSE_MV_ALG_DEFAULT; | 
 | #endif  // CUDART_VERSION >= 11021 | 
 |  | 
 |   CHECK_EQ(cusparseSpMV_bufferSize(context_->cusparse_handle_, | 
 |                                    op, | 
 |                                    &alpha, | 
 |                                    descr_, | 
 |                                    x.descr(), | 
 |                                    &beta, | 
 |                                    y->descr(), | 
 |                                    CUDA_R_64F, | 
 |                                    algorithm, | 
 |                                    &buffer_size), | 
 |            CUSPARSE_STATUS_SUCCESS); | 
 |   spmv_buffer_.Reserve(buffer_size); | 
 |   CHECK_EQ(cusparseSpMV(context_->cusparse_handle_, | 
 |                         op, | 
 |                         &alpha, | 
 |                         descr_, | 
 |                         x.descr(), | 
 |                         &beta, | 
 |                         y->descr(), | 
 |                         CUDA_R_64F, | 
 |                         algorithm, | 
 |                         spmv_buffer_.data()), | 
 |            CUSPARSE_STATUS_SUCCESS); | 
 | } | 
 |  | 
 | void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, | 
 |                                                   CudaVector* y) { | 
 |   SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x, y); | 
 | } | 
 |  | 
 | void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, | 
 |                                                  CudaVector* y) { | 
 |   // TODO(Joydeep Biswas): We should consider storing a transposed copy of the | 
 |   // matrix by converting CSR to CSC. From the cuSPARSE documentation: | 
 |   // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA | 
 |   // != CUSPARSE_OPERATION_NON_TRANSPOSE" | 
 |   SpMv(CUSPARSE_OPERATION_TRANSPOSE, x, y); | 
 | } | 
 |  | 
 | }  // namespace ceres::internal | 
 |  | 
 | #endif  // CERES_NO_CUDA |