internal/ceres/cuda_sparse_matrix.cc - ceres-solver - Git at Google

 // Ceres Solver - A fast non-linear least squares minimizer
 // Copyright 2023 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // * Redistributions of source code must retain the above copyright notice,
 //   this list of conditions and the following disclaimer.
 // * Redistributions in binary form must reproduce the above copyright notice,
 //   this list of conditions and the following disclaimer in the documentation
 //   and/or other materials provided with the distribution.
 // * Neither the name of Google Inc. nor the names of its contributors may be
 //   used to endorse or promote products derived from this software without
 //   specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
 //
 // A CUDA sparse matrix linear operator.

 // This include must come before any #ifndef check on Ceres compile options.
 // clang-format off
 #include "ceres/internal/config.h"
 // clang-format on

 #include "ceres/cuda_sparse_matrix.h"

 #include <math.h>

 #include <memory>

 #include "ceres/block_sparse_matrix.h"
 #include "ceres/compressed_row_sparse_matrix.h"
 #include "ceres/context_impl.h"
 #include "ceres/crs_matrix.h"
 #include "ceres/internal/export.h"
 #include "ceres/types.h"
 #include "ceres/wall_time.h"

 #ifndef CERES_NO_CUDA

 #include "ceres/cuda_buffer.h"
 #include "ceres/cuda_kernels_vector_ops.h"
 #include "ceres/cuda_vector.h"
 #include "cuda_runtime_api.h"
 #include "cusparse.h"

 namespace ceres::internal {
 namespace {
 // Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of
 // CUSPARSE_SPMV_ALG_DEFAULT.
 #if CUDART_VERSION >= 11021
 const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT;
 #else   // CUDART_VERSION >= 11021
 const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT;
 #endif  // CUDART_VERSION >= 11021
 size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle,
                               const cusparseOperation_t op,
                               const cusparseDnVecDescr_t& x,
                               const cusparseDnVecDescr_t& y,
                               const cusparseSpMatDescr_t& A) {
   size_t buffer_size;
   const double alpha = 1.0;
   const double beta = 1.0;
   CHECK_NE(A, nullptr);
   CHECK_EQ(cusparseSpMV_bufferSize(handle,
                                    op,
                                    &alpha,
                                    A,
                                    x,
                                    &beta,
                                    y,
                                    CUDA_R_64F,
                                    kSpMVAlgorithm,
                                    &buffer_size),
            CUSPARSE_STATUS_SUCCESS);
   return buffer_size;
 }

 size_t GetTempBufferSize(const cusparseHandle_t& handle,
                          const cusparseDnVecDescr_t& left,
                          const cusparseDnVecDescr_t& right,
                          const cusparseSpMatDescr_t& A) {
   CHECK_NE(A, nullptr);
   return std::max(GetTempBufferSizeForOp(
                       handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A),
                   GetTempBufferSizeForOp(
                       handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A));
 }
 }  // namespace

 CudaSparseMatrix::CudaSparseMatrix(int num_cols,
                                    CudaBuffer<int32_t>&& rows,
                                    CudaBuffer<int32_t>&& cols,
                                    ContextImpl* context)
     : num_rows_(rows.size() - 1),
       num_cols_(num_cols),
       num_nonzeros_(cols.size()),
       context_(context),
       rows_(std::move(rows)),
       cols_(std::move(cols)),
       values_(context, num_nonzeros_),
       spmv_buffer_(context) {
   Initialize();
 }

 CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
                                    const CompressedRowSparseMatrix& crs_matrix)
     : num_rows_(crs_matrix.num_rows()),
       num_cols_(crs_matrix.num_cols()),
       num_nonzeros_(crs_matrix.num_nonzeros()),
       context_(context),
       rows_(context, num_rows_ + 1),
       cols_(context, num_nonzeros_),
       values_(context, num_nonzeros_),
       spmv_buffer_(context) {
   rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1);
   cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_);
   values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
   Initialize();
 }

 CudaSparseMatrix::~CudaSparseMatrix() {
   CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS);
   descr_ = nullptr;
   CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_));
   CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_));
 }

 void CudaSparseMatrix::CopyValuesFromCpu(
     const CompressedRowSparseMatrix& crs_matrix) {
   // There is no quick and easy way to verify that the structure is unchanged,
   // but at least we can check that the size of the matrix and the number of
   // nonzeros is unchanged.
   CHECK_EQ(num_rows_, crs_matrix.num_rows());
   CHECK_EQ(num_cols_, crs_matrix.num_cols());
   CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
   values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
 }

 void CudaSparseMatrix::Initialize() {
   CHECK(context_->IsCudaInitialized());
   CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
            cusparseCreateCsr(&descr_,
                              num_rows_,
                              num_cols_,
                              num_nonzeros_,
                              rows_.data(),
                              cols_.data(),
                              values_.data(),
                              CUSPARSE_INDEX_32I,
                              CUSPARSE_INDEX_32I,
                              CUSPARSE_INDEX_BASE_ZERO,
                              CUDA_R_64F));

   // Note: values_.data() is used as non-zero pointer to device memory
   // When there is no non-zero values, data-pointer of values_ array will be a
   // nullptr; but in this case left/right products are trivial and temporary
   // buffer (and vector descriptors) is not required
   if (!num_nonzeros_) return;

   CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
            cusparseCreateDnVec(
                &descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F));
   CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
            cusparseCreateDnVec(
                &descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F));
   size_t buffer_size = GetTempBufferSize(
       context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_);
   spmv_buffer_.Reserve(buffer_size);
 }

 void CudaSparseMatrix::SpMv(cusparseOperation_t op,
                             const cusparseDnVecDescr_t& x,
                             const cusparseDnVecDescr_t& y) const {
   const double alpha = 1.0;
   const double beta = 1.0;

   CHECK_EQ(cusparseSpMV(context_->cusparse_handle_,
                         op,
                         &alpha,
                         descr_,
                         x,
                         &beta,
                         y,
                         CUDA_R_64F,
                         kSpMVAlgorithm,
                         spmv_buffer_.data()),
            CUSPARSE_STATUS_SUCCESS);
 }

 void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x,
                                                   CudaVector* y) const {
   DCHECK(GetTempBufferSize(
              context_->cusparse_handle_, y->descr(), x.descr(), descr_) <=
          spmv_buffer_.size());
   SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr());
 }

 void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x,
                                                  CudaVector* y) const {
   // TODO(Joydeep Biswas): We should consider storing a transposed copy of the
   // matrix by converting CSR to CSC. From the cuSPARSE documentation:
   // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
   // != CUSPARSE_OPERATION_NON_TRANSPOSE"
   DCHECK(GetTempBufferSize(
              context_->cusparse_handle_, x.descr(), y->descr(), descr_) <=
          spmv_buffer_.size());
   SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr());
 }

 }  // namespace ceres::internal

 #endif  // CERES_NO_CUDA
	// Ceres Solver - A fast non-linear least squares minimizer
	// Copyright 2023 Google Inc. All rights reserved.
	// http://ceres-solver.org/
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright notice,
	// this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above copyright notice,
	// this list of conditions and the following disclaimer in the documentation
	// and/or other materials provided with the distribution.
	// * Neither the name of Google Inc. nor the names of its contributors may be
	// used to endorse or promote products derived from this software without
	// specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.
	//
	// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
	//
	// A CUDA sparse matrix linear operator.

	// This include must come before any #ifndef check on Ceres compile options.
	// clang-format off
	#include "ceres/internal/config.h"
	// clang-format on

	#include "ceres/cuda_sparse_matrix.h"

	#include <math.h>

	#include <memory>

	#include "ceres/block_sparse_matrix.h"
	#include "ceres/compressed_row_sparse_matrix.h"
	#include "ceres/context_impl.h"
	#include "ceres/crs_matrix.h"
	#include "ceres/internal/export.h"
	#include "ceres/types.h"
	#include "ceres/wall_time.h"

	#ifndef CERES_NO_CUDA

	#include "ceres/cuda_buffer.h"
	#include "ceres/cuda_kernels_vector_ops.h"
	#include "ceres/cuda_vector.h"
	#include "cuda_runtime_api.h"
	#include "cusparse.h"

	namespace ceres::internal {
	namespace {
	// Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of
	// CUSPARSE_SPMV_ALG_DEFAULT.
	#if CUDART_VERSION >= 11021
	const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT;
	#else // CUDART_VERSION >= 11021
	const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT;
	#endif // CUDART_VERSION >= 11021
	size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle,
	const cusparseOperation_t op,
	const cusparseDnVecDescr_t& x,
	const cusparseDnVecDescr_t& y,
	const cusparseSpMatDescr_t& A) {
	size_t buffer_size;
	const double alpha = 1.0;
	const double beta = 1.0;
	CHECK_NE(A, nullptr);
	CHECK_EQ(cusparseSpMV_bufferSize(handle,
	op,
	&alpha,
	A,
	x,
	&beta,
	y,
	CUDA_R_64F,
	kSpMVAlgorithm,
	&buffer_size),
	CUSPARSE_STATUS_SUCCESS);
	return buffer_size;
	}

	size_t GetTempBufferSize(const cusparseHandle_t& handle,
	const cusparseDnVecDescr_t& left,
	const cusparseDnVecDescr_t& right,
	const cusparseSpMatDescr_t& A) {
	CHECK_NE(A, nullptr);
	return std::max(GetTempBufferSizeForOp(
	handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A),
	GetTempBufferSizeForOp(
	handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A));
	}
	} // namespace

	CudaSparseMatrix::CudaSparseMatrix(int num_cols,
	CudaBuffer<int32_t>&& rows,
	CudaBuffer<int32_t>&& cols,
	ContextImpl* context)
	: num_rows_(rows.size() - 1),
	num_cols_(num_cols),
	num_nonzeros_(cols.size()),
	context_(context),
	rows_(std::move(rows)),
	cols_(std::move(cols)),
	values_(context, num_nonzeros_),
	spmv_buffer_(context) {
	Initialize();
	}

	CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
	const CompressedRowSparseMatrix& crs_matrix)
	: num_rows_(crs_matrix.num_rows()),
	num_cols_(crs_matrix.num_cols()),
	num_nonzeros_(crs_matrix.num_nonzeros()),
	context_(context),
	rows_(context, num_rows_ + 1),
	cols_(context, num_nonzeros_),
	values_(context, num_nonzeros_),
	spmv_buffer_(context) {
	rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1);
	cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_);
	values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
	Initialize();
	}

	CudaSparseMatrix::~CudaSparseMatrix() {
	CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS);
	descr_ = nullptr;
	CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_));
	CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_));
	}

	void CudaSparseMatrix::CopyValuesFromCpu(
	const CompressedRowSparseMatrix& crs_matrix) {
	// There is no quick and easy way to verify that the structure is unchanged,
	// but at least we can check that the size of the matrix and the number of
	// nonzeros is unchanged.
	CHECK_EQ(num_rows_, crs_matrix.num_rows());
	CHECK_EQ(num_cols_, crs_matrix.num_cols());
	CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
	values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
	}

	void CudaSparseMatrix::Initialize() {
	CHECK(context_->IsCudaInitialized());
	CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
	cusparseCreateCsr(&descr_,
	num_rows_,
	num_cols_,
	num_nonzeros_,
	rows_.data(),
	cols_.data(),
	values_.data(),
	CUSPARSE_INDEX_32I,
	CUSPARSE_INDEX_32I,
	CUSPARSE_INDEX_BASE_ZERO,
	CUDA_R_64F));

	// Note: values_.data() is used as non-zero pointer to device memory
	// When there is no non-zero values, data-pointer of values_ array will be a
	// nullptr; but in this case left/right products are trivial and temporary
	// buffer (and vector descriptors) is not required
	if (!num_nonzeros_) return;

	CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
	cusparseCreateDnVec(
	&descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F));
	CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
	cusparseCreateDnVec(
	&descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F));
	size_t buffer_size = GetTempBufferSize(
	context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_);
	spmv_buffer_.Reserve(buffer_size);
	}

	void CudaSparseMatrix::SpMv(cusparseOperation_t op,
	const cusparseDnVecDescr_t& x,
	const cusparseDnVecDescr_t& y) const {
	const double alpha = 1.0;
	const double beta = 1.0;

	CHECK_EQ(cusparseSpMV(context_->cusparse_handle_,
	op,
	&alpha,
	descr_,
	x,
	&beta,
	y,
	CUDA_R_64F,
	kSpMVAlgorithm,
	spmv_buffer_.data()),
	CUSPARSE_STATUS_SUCCESS);
	}

	void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x,
	CudaVector* y) const {
	DCHECK(GetTempBufferSize(
	context_->cusparse_handle_, y->descr(), x.descr(), descr_) <=
	spmv_buffer_.size());
	SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr());
	}

	void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x,
	CudaVector* y) const {
	// TODO(Joydeep Biswas): We should consider storing a transposed copy of the
	// matrix by converting CSR to CSC. From the cuSPARSE documentation:
	// "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
	// != CUSPARSE_OPERATION_NON_TRANSPOSE"
	DCHECK(GetTempBufferSize(
	context_->cusparse_handle_, x.descr(), y->descr(), descr_) <=
	spmv_buffer_.size());
	SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr());
	}

	} // namespace ceres::internal

	#endif // CERES_NO_CUDA